From eb3a03c4552edfc6b82521fd75c81390236ab02b Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 12 Jan 2022 10:51:11 -0800 Subject: [PATCH 01/69] Rewrite accelerator_connector --- pytorch_lightning/strategies/ddp.py | 7 +- pytorch_lightning/strategies/ddp2.py | 11 +- pytorch_lightning/strategies/ddp_spawn.py | 7 +- pytorch_lightning/strategies/deepspeed.py | 2 +- .../connectors/accelerator_connector_new.py | 569 ++++++++++++++++++ pytorch_lightning/trainer/trainer.py | 92 +-- pytorch_lightning/utilities/exceptions.py | 6 + pytorch_lightning/utilities/imports.py | 2 + 8 files changed, 646 insertions(+), 50 deletions(-) create mode 100644 pytorch_lightning/trainer/connectors/accelerator_connector_new.py diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index feff575719ad4..fac1cbe2dc288 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -75,7 +75,7 @@ class DDPStrategy(ParallelStrategy): devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. """ - distributed_backend = _StrategyType.DDP + distributed_backend = "ddp" def __init__( self, @@ -428,6 +428,11 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description="DDP Strategy with `find_unused_parameters` as False", find_unused_parameters=False, ) + strategy_registry.register( + cls.distributed_backend, + cls, + description="Strategy", + ) def _should_run_deadlock_detection(self) -> bool: """Determines whether the plugin will perform process reconciliation in case of errors. diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index 9bde0f67e1b1a..5e1a349bd910d 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch +from typing import Dict from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -22,7 +23,7 @@ class DDP2Strategy(DDPStrategy): """DDP2 behaves like DP in one node, but synchronization across nodes behaves like in DDP.""" - distributed_backend = _StrategyType.DDP2 + distributed_backend = "ddp2" @property def global_rank(self) -> int: @@ -73,3 +74,11 @@ def set_world_ranks(self) -> None: return self.cluster_environment.set_global_rank(self.node_rank) self.cluster_environment.set_world_size(self.num_nodes) + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description="Strategy", + ) diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 03407e1c14232..501fb018a0fca 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -53,7 +53,7 @@ class DDPSpawnStrategy(ParallelStrategy): """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes.""" - distributed_backend = _StrategyType.DDP_SPAWN + distributed_backend = "ddp_spawn" def __init__( self, @@ -367,6 +367,11 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description="DDPSpawn Strategy with `find_unused_parameters` as False", find_unused_parameters=False, ) + strategy_registry.register( + cls.distributed_backend, + cls, + description="Strategy", + ) def teardown(self) -> None: super().teardown() diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index fa9c4d5376ff8..530ede34ec899 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ b/pytorch_lightning/strategies/deepspeed.py @@ -82,7 +82,7 @@ def _move_float_tensors_to_half(self, batch: Any): class DeepSpeedStrategy(DDPStrategy): - distributed_backend = _StrategyType.DEEPSPEED + distributed_backend = "deepspeed" DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" def __init__( diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector_new.py b/pytorch_lightning/trainer/connectors/accelerator_connector_new.py new file mode 100644 index 0000000000000..186d175f33d64 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/accelerator_connector_new.py @@ -0,0 +1,569 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
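+"""Rewrite of the accelerator connector.
+
+``AcceleratorConnector`` resolves the Trainer's ``accelerator``, ``strategy``, ``devices``,
+``precision`` and ``plugins`` arguments into concrete ``Accelerator``, ``Strategy``,
+``ClusterEnvironment`` and ``PrecisionPlugin`` objects and attaches them to the chosen strategy.
+Illustrative examples of the flag forms this connector accepts::
+
+    Trainer(accelerator="gpu", devices=2, strategy="ddp_spawn", precision=16)
+    Trainer(strategy=DDPStrategy())  # a Strategy instance takes priority over the other flags
+"""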
+ +import logging +import os +from typing import List, Optional, Sequence, Union +from weakref import proxy + +import torch + +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.ipu import IPUAccelerator +from pytorch_lightning.accelerators.tpu import TPUAccelerator +from pytorch_lightning.plugins import ( + ApexMixedPrecisionPlugin, + CheckpointIO, + DeepSpeedPrecisionPlugin, + DoublePrecisionPlugin, + FullyShardedNativeMixedPrecisionPlugin, + IPUPrecisionPlugin, + NativeMixedPrecisionPlugin, + PrecisionPlugin, + ShardedNativeMixedPrecisionPlugin, + TPUBf16PrecisionPlugin, + TPUPrecisionPlugin, +) +from pytorch_lightning.plugins.environments import ( + ClusterEnvironment, + KubeflowEnvironment, + LightningEnvironment, + LSFEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) +from pytorch_lightning.strategies import ( + DataParallelStrategy, + DDP2Strategy, + DDPFullyShardedStrategy, + DDPShardedStrategy, + DDPSpawnShardedStrategy, + DDPSpawnStrategy, + DDPStrategy, + DeepSpeedStrategy, + HorovodStrategy, + IPUStrategy, + SingleDeviceStrategy, + SingleTPUStrategy, + Strategy, + StrategyRegistry, + TPUSpawnStrategy, +) +from pytorch_lightning.utilities import ( + _AcceleratorType, + _StrategyType, + AMPType, + device_parser, + rank_zero_deprecation, + rank_zero_info, + rank_zero_warn, +) +from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.exceptions import MisconfigurationException, DeviceNotAvailibleException, ImpactableConfigurationException +from pytorch_lightning.utilities.imports import ( + _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, + _GPU_AVAILABLE, + _TORCH_GREATER_EQUAL_1_8, + _TPU_AVAILABLE, +) + +if _HOROVOD_AVAILABLE: + import horovod.torch as hvd + +log = logging.getLogger(__name__) + + +class AcceleratorConnector: + def __init__( + self, + devices, + num_nodes, + accelerator, # reduce typing + strategy: Optional[Union[str, Strategy]], + plugins, + precision, + amp_type, + amp_level, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic: bool, + num_processes, # deprecated + tpu_cores, # deprecated + ipus, # deprecated + gpus, # deprecated + gpu_ids, + ): + """ + A. accelerator could be: + 1. strategy class (deprecated in 1.5 will be removed in 1.7) + 2. strategy str (deprecated in 1.5 will be removed in 1.7) + 3. accelerator class + 4. accelerator str + 5. accelerator auto + + B. strategy could be : + 1. strategy class + 2. strategy str registered with strategyRegister + 3. strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) + + C. plugins could be: + 1. List of str, which could contains: + i. strategy str + ii. precision str (Not supported in the old accelerator_connector version) + iii. checkpoint_io str (Not supported in the old accelerator_connector version) + iv. cluster_environment str (Not supported in the old accelerator_connector version) + 2. List of class, which could contains: + i. strategy class (deprecated in 1.5 will be removed in 1.7) + ii. precision class (should be removed, and precision flag should allow user pass classes) + iii. checkpoint_io class + iv. cluster_environment class + + + priorities which to take when: + A. Class > str + B. Strategy > Accelerator/precision/plugins + C. When multiple flag set to the same thing? (ignore? 
not handled for now) + + """ + + # Get registered strategies, existing accelerators and precision plugins + self._existing_strategies_str = StrategyRegistry.available_strategies() + print(self._existing_strategies_str) + self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] + self._supported_precision = PrecisionType.supported_types() + + # raise misconfig exceptions if their is conflict between flags + # set the valid flag to self._x_flag after validation + # for example: if accelerator is strategy class, set self._strategy_flag = accelerator + # for devices: assign gpus ipus and etcs to accelerator_flag and devices_flag + self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) + self._device_config_check_and_set_final_flags(devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores) + + # handle auto and choose flag when user hasn't set it up. + if self._accelerator_flag == 'auto' or self._accelerator_flag is None: + self._choose_accelerator() + else: + # [RFC] move to XAccelerator class init? + self._check_device_availibility() + + # Accelerator initialization + # TODO devices logic handling still in process, not ready for reviews + self._set_parallel_devices_and_init_accelerator() + + # handle strategy flag is not set, choose for user + if self._strategy_flag is None: + self._choose_strategy() + + self._choose_and_init_cluster_environment() + self._check_capatibility_and_init_precision() + self._init_strategy() + + + def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): + """ + This method checks: + 1. strategy flag: strategy, accelerator and plugin can all set strategies + 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; + If accelerator is strategy related, logic handled in 1 above + 3. precision could be set by precision and plugins flag + 4. 
plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment + """ + self._strategy_flag, self._accelerator_flag, self._precision_flag, self._cluster_environment, self.checkpoint_io, self._amp_level_flag, self._amp_type_flag = None, None, None, None, None, amp_type, amp_level + if strategy: + self._strategy_flag = strategy + # handle duplications and conflict + if isinstance(accelerator, Strategy) and strategy != accelerator: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in accelerator") + if isinstance(accelerator, str) and accelerator in self._existing_strategies_str and strategy != accelerator: + raise MisconfigurationException("strategy str already set through strategy flag, duplicated in accelerator") + if plugins: + for plugin in plugins: + if isinstance(plugin, Strategy) and strategy != plugin: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") + if isinstance(plugin, str) and plugin in self._existing_strategies_str: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") + + + if accelerator in self._existing_accelerator_type or accelerator=="auto" or isinstance(accelerator, Accelerator): + self._accelerator_flag = accelerator + elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): + rank_zero_deprecation( + f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" + f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." + ) + self._strategy_flag = accelerator + + + if precision: + self._precision_flag = precision + # handle duplications and conflict + if plugins: + for plugin in plugins: + if isinstance(plugin, PrecisionPlugin): + raise MisconfigurationException("precision set in both precision flag and plugin flag") + + if plugins: + for plugin in plugins: + if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: + self._strategy_flag = plugin + elif isinstance(plugin, PrecisionPlugin) or isinstance(plugin, str) and plugin in self._supported_precision: + self._precision_flag = plugin + elif isinstance(plugin, CheckpointIO): + self.checkpoint_io = plugin + elif isinstance(plugin, ClusterEnvironment): + self._cluster_environment = plugin + else: + raise MisconfigurationException(f"Does not recognize flag {plugin}") + + + # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up + if self._strategy_flag and isinstance(self._strategy_flag, Strategy): + if self._strategy_flag.accelerator: + if self._accelerator_flag: + raise MisconfigurationException("accelerator set through both strategy class and accelerator flag, choose one") + else: + self._accelerator_flag = self._strategy_flag.accelerator + if self._strategy_flag.precision_plugin: + # precision has default value 32, we can not tell whether user set it or not [RFC] remove default from trainer? 
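+                # so the precision plugin attached to the strategy always wins over the `precision` argument here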
+ # if self._precision_flag: + # raise MisconfigurationException("precision set through both strategy class and flags, choose one place to set") + # else: + self._precision_flag = self._strategy_flag.precision_plugin + if self._strategy_flag.checkpoint_io: + if self.checkpoint_io: + raise MisconfigurationException("checkpoint_io set through both strategy class and plugins, choose one") + else: + self.checkpoint_io = self._strategy_flag.checkpoint_io + if getattr(self._strategy_flag, "cluster_environment", None): + if self._cluster_environment: + raise MisconfigurationException("cluster_environment set through both strategy class and plugins, choose one") + else: + self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") + + + amp_type = amp_type.lower() if isinstance(amp_type, str) else None + self._amp_type_flag = AMPType.from_str(amp_type) if amp_type is not None else None + + # TODO still working on these flags + # if amp_level is not None and self._amp_type_flag != AMPType.APEX: + # raise MisconfigurationException( + # f"You have asked for `amp_level={self._amp_level_flag!r}` but it's only supported with `amp_backend='apex'`." + # ) + self._amp_level_flag = amp_level + + + def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): + if num_nodes == "auto": + self._num_nodes_flag = 1 + else : + self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + + ##### to be deleted v1.7 + deprecated_devices_specific_nums = num_processes or gpus or ipus or tpu_cores + self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_nums, num_processes, gpus, ipus, tpu_cores) + ##### deleted end + if devices == "auto": + if self._accelerator_flag is None: + raise MisconfigurationException( + f"You passed `devices={devices}` but haven't specified" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" + ) + if not self._device_flag: + self._device_flag = devices + + + + def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_nums, num_processes, gpus, ipus, tpu_cores): + ##### to be deleted v1.7vbg + # set devices base on num_processes, gpus, ipus, tpu_cores + if devices: + rank_zero_warn(f"will be ignored, instand the device specific number {deprecated_devices_specific_nums} will be used") + if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count(True) > 1: + rank_zero_warn(f"more than one device specifc flag has been set") + self._device_flag = deprecated_devices_specific_nums + + if not self._accelerator_flag: + # set accelerator type base on num_processes, gpus, ipus, tpu_cores + if num_processes: + self._accelerator_flag = "cpu" + if gpus: + self._accelerator_flag = "gpu" + if tpu_cores: + self._accelerator_flag = "tpu" + if ipus: + self._accelerator_flag = "ipu" + #### delete end + + def _choose_accelerator(self): + if self._accelerator_flag == "auto": + if _TPU_AVAILABLE: + self._accelerator_flag = "tpu" + elif _IPU_AVAILABLE: + self._accelerator_flag = "ipu" + elif _GPU_AVAILABLE: + self._accelerator_flag = "gpu" + else: + self._accelerator_flag = "cpu" + # [RFC] this is current logic, if accelerator not set, default cpu? 
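+        # reached only when accelerator=None (the "auto" case is handled above): fall back to CPU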
+ else: + self._accelerator_flag = "cpu" + + + def _check_device_availibility(self): + for accelerator_flag, available in zip(self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True]): + if self._accelerator_flag == accelerator_flag: + if not available: + raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") + + # TODO in progress for setting up devices + def _set_parallel_devices_and_init_accelerator(self): + self._parallel_devices = [] + + if isinstance(self._accelerator_flag, Accelerator): + self.accelerator = self._accelerator_flag() + elif self._accelerator_flag == "tpu": + self.accelerator = TPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = TPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = list(range(self._device_flag)) + + elif self._accelerator_flag == "ipu": + self.accelerator = IPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = IPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = list(range(self._device_flag)) + + elif self._accelerator_flag == "gpu": + self.accelerator = GPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = GPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + + elif self._accelerator_flag == "cpu": + self.accelerator = CPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = CPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = [torch.device("cpu")] * self._device_flag + + + def _choose_and_init_cluster_environment(self): + self.cluster_environment = LightningEnvironment() + if isinstance(self._cluster_environment, ClusterEnvironment): + self.cluster_environment = self._cluster_environment + elif self._is_slurm_managing_tasks(): + rank_zero_info("Multiprocessing is handled by SLURM.") + self.cluster_environment = SLURMEnvironment() + else: + for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + if env_type.detect(): + self.cluster_environment = env_type() + + + def _is_slurm_managing_tasks(self): + """ + used by choosing cluster enviroment + """ + if ( + (not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") + or not SLURMEnvironment.detect() + or SLURMEnvironment.job_name() == "bash" # in interactive mode we don't manage tasks + ): + return False + + total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag + num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) + return num_slurm_tasks == total_requested_devices + + def _choose_strategy(self): + if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): + self._strategy_flag = HorovodStrategy() + + if self._accelerator_flag == "ipu": + self._strategy_flag = IPUStrategy() + elif self._accelerator_flag == "tpu": + if self._parallel_devices and len(self._parallel_devices)>1: + self._strategy_flag = TPUSpawnStrategy() + else: + self._srategy_flag = SingleTPUStrategy() + + # [RFC] in existing logic SingleDevice strategy choice diverge between cpu and gpu, should we merge? 
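+        # GPU: multi-node or a single requested device falls back to DDP, several local devices use spawn-based DDP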
+ elif self._accelerator_flag == "gpu": + if self._num_nodes_flag > 1: + self._strategy_flag = DDPStrategy() + elif len(self._parallel_devices) == 1: + self._strategy_flag = DDPStrategy() + elif len(self._parallel_devices) > 1: + self._strategy_flag = DDPSpawnStrategy() + else: + self._strategy_flag = DDPStrategy() + else: + if self._num_nodes_flag > 1: + self._strategy_flag = DDPStrategy() + elif len(self._parallel_devices) <= 1: + device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" + self._strategy_flag = SingleDeviceStrategy(device = device) + elif len(self._parallel_devices) > 1: + self._strategy_flag = DDPSpawnStrategy() + else: + self._strategy_flag = DDPStrategy() + + + def _check_capatibility_and_init_precision(self): + self._precision_misconfig_check() + if isinstance(self._precision_flag, PrecisionPlugin): + self.precision_plugin = self._precision_flag + + if self._accelerator_flag =="ipu": + self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) + if self._accelerator_flag == "tpu": + if self._precision_flag == 32: + self.precision_plugin = TPUPrecisionPlugin() + elif self._precision_flag in (16, "bf16"): + if self._precision_flag == 16: + # this is not deprecated to ease transition between accelerator environments + rank_zero_warn( + f"You passed `Trainer(accelerator='tpu', precision=16)` but {self._amp_type_flag.value} AMP" + f" is not supported with TPUs. Using `precision='bf16'` instead." + ) + self.precision_plugin = TPUBf16PrecisionPlugin() + if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): + self.precision_plugin = DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) + + if self._precision_flag == 32: + self.precision_plugin = PrecisionPlugin() + if self._precision_flag == 64: + self.precision_plugin = DoublePrecisionPlugin() + + # maybe convert the precision value + if self._precision_flag == 16 and self._accelerator_flag == "cpu": + # this automatic switch is to ease transition between accelerator environments + rank_zero_warn( + "You passed `Trainer(accelerator='cpu', precision=16)` but native AMP is not supported on CPU." + " Using `precision='bf16'` instead." + ) + self._precision_flag = "bf16" + + if self._precision_flag in (16, "bf16"): + rank_zero_info( + f"Using 16bit {self._amp_type_flag.value} Automatic Mixed Precision (AMP)" + if self._precision_flag == 16 + else "Using bfloat16 Automatic Mixed Precision (AMP)" + ) + + if self._amp_type_flag == AMPType.NATIVE: + device = "cpu" if self._accelerator_flag=="cpu" else "cuda" + + # TODO in progress implement the two following shard types + # if self._is_sharded_training_type: + # return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + # if self._is_fully_sharded_training_type: + # return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + # return NativeMixedPrecisionPlugin(self._precision_flag, device) + + + self._amp_level_flag = self._amp_level_flag or "O2" + self.precision_plugin = ApexMixedPrecisionPlugin(self._amp_level_flag) + self.precision_plugin = PrecisionPlugin() + + def _precision_misconfig_check(self): + if self._accelerator_flag == "ipu": + if self._precision_flag not in (16, 32): + raise MisconfigurationException( + f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." 
+ ) + if self._accelerator_flag == "tpu" and self._precision_flag == 64: + raise MisconfigurationException( + "`Trainer(accelerator='tpu', precision=64)` is not implemented." + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " requesting this feature." + ) + if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: + # apex was explicitly passed, not a good idea to silently switch to native AMP + raise MisconfigurationException( + "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" + " but apex AMP not supported on CPU." + ) + if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: + raise MisconfigurationException( + f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but it's not supported." + " Try using `amp_type='native'` instead." + ) + + # if self._precision_flag in (16, "bf16") and self._amp_type_flag == AMPType.APEX: + # if self._is_sharded_training_type or self._is_fully_sharded_training_type: + # raise MisconfigurationException( + # "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." + # ) + + + def _init_strategy(self): + if isinstance(self._strategy_flag, str): + self.strategy = StrategyRegistry.get(self._strategy_flag) + else: + self.strategy = self._strategy_flag + self.strategy.accelerator = self.accelerator + if self.precision_plugin: + self.strategy.precision_plugin = self.precision_plugin + if self.checkpoint_io: + self.strategy.checkpoint_io = self.checkpoint_io + self.strategy.cluster_environment = self.cluster_environment + + + + + + ############################################################################## + # the following logic should be deprecated/removed + # Added here to keep backward compabilities + + # @property + # def parallel_devices(self) -> List[Union[torch.device, int]]: + # return self._parallel_device + + # @property + # def replace_sampler_ddp(): + # return self.replace_sampler_ddp + + # def _distrib_type(): + + # def _device_type(): + + # def num_nodes(): + + # def num_processes(): + + # def root_gpu(): + + def devices(self): + return len(self._parallel_devices) + + # def parallel_device_ids(): + + # def gpus(): + + # def is_distributed(): + + def has_ipu(self): + return self._accelerator_flag == "ipu" + + def has_tpu(self): + return self._accelerator_flag == "tpu" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b6a0d7fa452e0..bb3dde1e893a3 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -138,7 +138,7 @@ def __init__( gradient_clip_algorithm: Optional[str] = None, process_position: int = 0, num_nodes: int = 1, - num_processes: int = 1, + num_processes: int = None, devices: Optional[Union[List[int], str, int]] = None, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, @@ -435,23 +435,23 @@ def __init__( self._data_connector = DataConnector(self, multiple_trainloader_mode) self._accelerator_connector = AcceleratorConnector( - num_processes, - devices, - tpu_cores, - ipus, - accelerator, - strategy, - gpus, - gpu_ids, - num_nodes, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - precision, - amp_backend, - amp_level, - plugins, + num_processes = num_processes, + devices = devices, + tpu_cores = tpu_cores, + ipus = ipus, + accelerator = accelerator, + strategy = strategy, + gpus = gpus, + gpu_ids = 
gpu_ids, + num_nodes = num_nodes, + sync_batchnorm = sync_batchnorm, + benchmark = benchmark, + replace_sampler_ddp = replace_sampler_ddp, + deterministic = deterministic, + precision = precision, + amp_type = amp_backend, + amp_level = amp_level, + plugins = plugins, ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) @@ -636,7 +636,7 @@ def _determine_data_use_amount(self, overfit_batches: float) -> None: self.limit_val_batches = 0 def _setup_on_init(self, num_sanity_val_steps: int) -> None: - self._log_device_info() + # self._log_device_info() self.should_stop = False self.state = TrainerState() @@ -1968,45 +1968,45 @@ def should_rank_save_checkpoint(self) -> bool: isinstance(strategy, pl.strategies.TPUSpawnStrategy) and strategy.local_rank == 0 or strategy.is_global_zero ) - @property - def _strategy_type(self) -> _StrategyType: - return self._accelerator_connector._strategy_type + # @property + # def _strategy_type(self) -> _StrategyType: + # return self._accelerator_connector._strategy_type - @property - def _device_type(self) -> _AcceleratorType: - return self._accelerator_connector._device_type + # @property + # def _device_type(self) -> _AcceleratorType: + # return self._accelerator_connector._device_type - @property - def num_nodes(self) -> int: - return self._accelerator_connector.num_nodes + # @property + # def num_nodes(self) -> int: + # return self._accelerator_connector.num_nodes - @property - def num_processes(self) -> int: - return self._accelerator_connector.num_processes + # @property + # def num_processes(self) -> int: + # return self._accelerator_connector.num_processes - @property - def root_gpu(self) -> Optional[int]: - return self._accelerator_connector.root_gpu + # @property + # def root_gpu(self) -> Optional[int]: + # return self._accelerator_connector.root_gpu - @property - def tpu_cores(self) -> int: - return self._accelerator_connector.tpu_cores + # @property + # def tpu_cores(self) -> int: + # return self._accelerator_connector.tpu_cores - @property - def ipus(self) -> int: - return self._accelerator_connector.num_ipus + # @property + # def ipus(self) -> int: + # return self._accelerator_connector.num_ipus - @property - def num_gpus(self) -> int: - return self._accelerator_connector.num_gpus + # @property + # def num_gpus(self) -> int: + # return self._accelerator_connector.num_gpus @property def devices(self) -> Optional[Union[List[int], str, int]]: return self._accelerator_connector.devices - @property - def data_parallel_device_ids(self) -> Optional[List[int]]: - return self._accelerator_connector.parallel_device_ids + # @property + # def data_parallel_device_ids(self) -> Optional[List[int]]: + # return self._accelerator_connector.parallel_device_ids @property def lightning_module(self) -> "pl.LightningModule": diff --git a/pytorch_lightning/utilities/exceptions.py b/pytorch_lightning/utilities/exceptions.py index ece4629819b33..24fbbac44d156 100644 --- a/pytorch_lightning/utilities/exceptions.py +++ b/pytorch_lightning/utilities/exceptions.py @@ -16,6 +16,12 @@ class MisconfigurationException(Exception): """Exception used to inform users of misuse with PyTorch Lightning.""" +class DeviceNotAvailibleException(Exception): + """Exception used to inform users that requested devices are not availible.""" + +class ImpactableConfigurationException(Exception): + """Exception used to inform users that configuration impactable with each other.""" + class DeadlockDetectedException(Exception): """Exception 
used when a deadlock has been detected and processes are being killed.""" diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 6c20d90e01646..602c8b50c92e9 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -133,6 +133,8 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False +_GPU_AVAILABLE = torch.cuda.is_available() and torch.cuda.device_count()>0 + # experimental feature within PyTorch Lightning. def _fault_tolerant_training() -> bool: From 50a82d2f7c4b2960a15dec00b4bbf061e678fab0 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 24 Jan 2022 17:09:08 -0800 Subject: [PATCH 02/69] update --- pytorch_lightning/strategies/ddp2.py | 2 +- pytorch_lightning/strategies/ddp_spawn.py | 4 + pytorch_lightning/strategies/dp.py | 13 +- pytorch_lightning/strategies/fully_sharded.py | 8 +- pytorch_lightning/strategies/horovod.py | 12 +- pytorch_lightning/strategies/ipu.py | 12 +- pytorch_lightning/strategies/parallel.py | 8 + pytorch_lightning/strategies/sharded.py | 7 +- pytorch_lightning/strategies/sharded_spawn.py | 7 +- pytorch_lightning/strategies/single_device.py | 13 +- pytorch_lightning/strategies/single_tpu.py | 11 +- pytorch_lightning/strategies/strategy.py | 7 +- pytorch_lightning/strategies/tpu_spawn.py | 8 + .../connectors/accelerator_connector_new.py | 231 +++++++++++++----- pytorch_lightning/trainer/trainer.py | 54 ++-- 15 files changed, 297 insertions(+), 100 deletions(-) diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index 5e1a349bd910d..2633508e6bd82 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -80,5 +80,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.distributed_backend, cls, - description="Strategy", + description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 501fb018a0fca..2e73c64a1b207 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -87,6 +87,10 @@ def __init__( def num_nodes(self) -> int: return self._num_nodes + @property + def num_processes(self): + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + @num_nodes.setter def num_nodes(self, num_nodes: int) -> None: # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index 0c9723c183a5e..bcac4f4f156d5 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional +from typing import Any, List, Optional, Dict import torch from torch.nn import DataParallel, Module @@ -31,7 +31,7 @@ class DataParallelStrategy(ParallelStrategy): """Implements data-parallel training in a single process, i.e., the model gets replicated to each device and each gets a split of the data.""" - distributed_backend = _StrategyType.DP + distributed_backend = "dp" def __init__( self, @@ -149,6 +149,15 @@ def training_step_end(self, output): return output + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) + + def teardown(self) -> None: super().teardown() if self.root_device.type == "cuda": diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 9a24197c6c33d..4a05abd0dd9d8 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -36,7 +36,7 @@ class DDPFullyShardedStrategy(DDPStrategy): - distributed_backend = _StrategyType.DDP_FULLY_SHARDED + distributed_backend = "ddp_fully_sharded" def __init__( self, @@ -212,3 +212,9 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( "fsdp", cls, description="Fully sharded training with checkpointing the full state dict." ) + + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index a69850b60f9c0..90b091a9eee18 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import ExitStack -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union, Dict import torch import torch.nn as nn @@ -37,7 +37,7 @@ class HorovodStrategy(ParallelStrategy): """Plugin for Horovod distributed training integration.""" - distributed_backend = _StrategyType.HOROVOD + distributed_backend = "horovod" def __init__( self, @@ -196,6 +196,14 @@ def _filter_named_parameters(model: nn.Module, optimizer: Optimizer) -> List[Tup opt_params = {p for group in optimizer.param_groups for p in group.get("params", [])} return [(name, p) for name, p in model.named_parameters() if p in opt_params] + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) + def teardown(self) -> None: super().teardown() # teardown may be called before `_exit_stack` is set diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index 6b6433841d5ae..c13431d1ad8d8 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -13,7 +13,7 @@ # limitations under the License. 
import json import os -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union, Dict import torch from torch.utils.data import DataLoader @@ -62,6 +62,8 @@ def _move_float_tensors_to_half(self, batch: Any) -> Any: class IPUStrategy(ParallelStrategy): """Plugin for training on IPU devices.""" + distributed_backend = "ipu" + def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, @@ -360,3 +362,11 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra def broadcast(self, obj: object, src: int = 0) -> object: return obj + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) diff --git a/pytorch_lightning/strategies/parallel.py b/pytorch_lightning/strategies/parallel.py index 11207065b7e21..d8a8ab50abe2d 100644 --- a/pytorch_lightning/strategies/parallel.py +++ b/pytorch_lightning/strategies/parallel.py @@ -85,6 +85,14 @@ def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=len(self.parallel_devices), rank=self.global_rank) return distributed_sampler_kwargs + @property + def parallel_devices(self): + return self._parallel_devices + + @parallel_devices.setter + def parallel_devices(self, parallel_devices): + self._parallel_devices = parallel_devices + def reconciliate_processes(self, trace: str): """Function to re-conciliate processes on failure.""" diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 2d1584a2e15e5..1f402126b6efe 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -37,7 +37,7 @@ class DDPShardedStrategy(DDPStrategy): """Optimizer and gradient sharded training provided by FairScale.""" - distributed_backend = _StrategyType.DDP_SHARDED + distributed_backend = "ddp_sharded" _REDUCE_BUFFER_SIZE_DEFAULT: int = 2 ** 23 # 8M def configure_ddp(self) -> None: @@ -135,3 +135,8 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description="DDP Sharded Strategy with `find_unused_parameters` as False", find_unused_parameters=False, ) + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index 289e3491be0b4..1a7c6b6e00d1c 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -36,7 +36,7 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): """Optimizer sharded training provided by FairScale.""" - distributed_backend = _StrategyType.DDP_SHARDED_SPAWN + distributed_backend = "ddp_sharded_spawn" def configure_ddp(self) -> None: self.model, self.optimizers = self._setup_model_and_optimizers( @@ -118,3 +118,8 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description="DDP Spawn Sharded Strategy with `find_unused_parameters` as False", find_unused_parameters=False, ) + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index 440c73afce8fc..f866dfe204ade 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -13,7 
+13,7 @@ # limitations under the License. from __future__ import annotations -from typing import Any +from typing import Any, Dict import torch @@ -26,10 +26,11 @@ class SingleDeviceStrategy(Strategy): """Strategy that handles communication on a single device.""" + distributed_backend = "single_device" def __init__( self, - device: _DEVICE, + device: _DEVICE = "cpu", accelerator: pl.accelerators.accelerator.Accelerator | None = None, checkpoint_io: CheckpointIO | None = None, precision_plugin: PrecisionPlugin | None = None, @@ -79,6 +80,14 @@ def barrier(self, *args, **kwargs) -> None: def broadcast(self, obj: object, src: int = 0) -> object: return obj + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) + def teardown(self) -> None: super().teardown() if self.root_device.type == "cuda": diff --git a/pytorch_lightning/strategies/single_tpu.py b/pytorch_lightning/strategies/single_tpu.py index 8465656f034ab..3d471f2dabd24 100644 --- a/pytorch_lightning/strategies/single_tpu.py +++ b/pytorch_lightning/strategies/single_tpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Optional +from typing import Optional, Dict import pytorch_lightning as pl from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO @@ -27,6 +27,7 @@ class SingleTPUStrategy(SingleDeviceStrategy): """Strategy for training on a single TPU device.""" + distributed_backend = "single_tpu" def __init__( self, @@ -71,6 +72,14 @@ def setup(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) + def teardown(self) -> None: super().teardown() # TPU teardown diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index 629911911b780..4b339e0b0efb4 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -441,7 +441,12 @@ def teardown(self) -> None: @classmethod def register_strategies(cls, strategies_registry) -> None: - pass + if cls.distributed_backend: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) def on_train_start(self) -> None: """Called when train begins.""" diff --git a/pytorch_lightning/strategies/tpu_spawn.py b/pytorch_lightning/strategies/tpu_spawn.py index a6e82441da296..4bcf0d1ef31b6 100644 --- a/pytorch_lightning/strategies/tpu_spawn.py +++ b/pytorch_lightning/strategies/tpu_spawn.py @@ -52,6 +52,8 @@ class TPUSpawnStrategy(DDPSpawnStrategy): """Strategy for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method.""" + distributed_backend = "tpu_spawn" + def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, @@ -346,3 +348,9 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( "tpu_spawn_debug", cls, description="TPUSpawn Strategy with `debug` as True", debug=True ) + + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) diff --git 
a/pytorch_lightning/trainer/connectors/accelerator_connector_new.py b/pytorch_lightning/trainer/connectors/accelerator_connector_new.py index 186d175f33d64..8c69ef6b8ad5a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector_new.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector_new.py @@ -109,19 +109,19 @@ def __init__( gpu_ids, ): """ - A. accelerator could be: + A. accelerator flag could be: 1. strategy class (deprecated in 1.5 will be removed in 1.7) 2. strategy str (deprecated in 1.5 will be removed in 1.7) 3. accelerator class 4. accelerator str 5. accelerator auto - B. strategy could be : + B. strategy flag could be : 1. strategy class 2. strategy str registered with strategyRegister 3. strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) - C. plugins could be: + C. plugins flag could be: 1. List of str, which could contains: i. strategy str ii. precision str (Not supported in the old accelerator_connector version) @@ -141,6 +141,7 @@ def __init__( """ + # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() print(self._existing_strategies_str) @@ -154,26 +155,40 @@ def __init__( self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) self._device_config_check_and_set_final_flags(devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores) - # handle auto and choose flag when user hasn't set it up. + + # --Accelerator------------------------------------------------------------- + # handle `auto` and `None` if self._accelerator_flag == 'auto' or self._accelerator_flag is None: self._choose_accelerator() - else: - # [RFC] move to XAccelerator class init? - self._check_device_availibility() - - # Accelerator initialization - # TODO devices logic handling still in process, not ready for reviews + # else: + # # [RFC] move to XAccelerator class init? + # self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() - # handle strategy flag is not set, choose for user + + # --Cluster_environment----------------------------------------------------- + self._choose_and_init_cluster_environment() + + + # --Strategy Part 1 : choose strategy --------------------------------------- if self._strategy_flag is None: self._choose_strategy() + # Reset strategy even user has specificed one + self._strategy_fallbacks() - self._choose_and_init_cluster_environment() + + # --Precision---------------------------------------------------------------- self._check_capatibility_and_init_precision() + + + # --Strategy Part 2 : init Strategy and set Strategy properties ------------- self._init_strategy() + # set properties not used in accelerator_connector. TODO move out of this file + # self.gpus = gpus or devices + self.replace_sampler_ddp = replace_sampler_ddp + def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): """ This method checks: @@ -207,6 +222,11 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." 
) self._strategy_flag = accelerator + elif accelerator == "ddp_cpu": + rank_zero_warn( + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." + ) + self._strategy_flag = accelerator if precision: @@ -243,6 +263,7 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl # if self._precision_flag: # raise MisconfigurationException("precision set through both strategy class and flags, choose one place to set") # else: + print("here") self._precision_flag = self._strategy_flag.precision_plugin if self._strategy_flag.checkpoint_io: if self.checkpoint_io: @@ -273,9 +294,11 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce else : self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + self._device_flag = devices ##### to be deleted v1.7 - deprecated_devices_specific_nums = num_processes or gpus or ipus or tpu_cores - self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_nums, num_processes, gpus, ipus, tpu_cores) + deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores + if deprecated_devices_specific_flag: + self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores) ##### deleted end if devices == "auto": if self._accelerator_flag is None: @@ -283,19 +306,16 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce f"You passed `devices={devices}` but haven't specified" " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" ) - if not self._device_flag: - self._device_flag = devices - - def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_nums, num_processes, gpus, ipus, tpu_cores): + def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores): ##### to be deleted v1.7vbg # set devices base on num_processes, gpus, ipus, tpu_cores if devices: - rank_zero_warn(f"will be ignored, instand the device specific number {deprecated_devices_specific_nums} will be used") + rank_zero_warn(f"will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used") if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count(True) > 1: rank_zero_warn(f"more than one device specifc flag has been set") - self._device_flag = deprecated_devices_specific_nums + self._device_flag = deprecated_devices_specific_flag if not self._accelerator_flag: # set accelerator type base on num_processes, gpus, ipus, tpu_cores @@ -319,6 +339,8 @@ def _choose_accelerator(self): self._accelerator_flag = "gpu" else: self._accelerator_flag = "cpu" + if self._device_flag == "auto": + self._device_flag = 1 # [RFC] this is current logic, if accelerator not set, default cpu? 
else: self._accelerator_flag = "cpu" @@ -335,7 +357,7 @@ def _set_parallel_devices_and_init_accelerator(self): self._parallel_devices = [] if isinstance(self._accelerator_flag, Accelerator): - self.accelerator = self._accelerator_flag() + self.accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": self.accelerator = TPUAccelerator() if self._device_flag == "auto" or not self._device_flag: @@ -354,8 +376,12 @@ def _set_parallel_devices_and_init_accelerator(self): self.accelerator = GPUAccelerator() if self._device_flag == "auto" or not self._device_flag: self._device_flag = GPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): + if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): + self._device_flag = int(self._device_flag) self._parallel_devices = [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + elif isinstance(self._device_flag, list): + self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] + elif self._accelerator_flag == "cpu": self.accelerator = CPUAccelerator() @@ -364,6 +390,8 @@ def _set_parallel_devices_and_init_accelerator(self): if isinstance(self._device_flag, int): self._parallel_devices = [torch.device("cpu")] * self._device_flag + self._gpus = self._device_flag + def _choose_and_init_cluster_environment(self): self.cluster_environment = LightningEnvironment() @@ -383,8 +411,10 @@ def _is_slurm_managing_tasks(self): used by choosing cluster enviroment """ if ( - (not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") - or not SLURMEnvironment.detect() + #(not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") + # the above logic moved to _select_strategy(), only check _is_slurm_managing_tasks() + # when strategy flag is ddp or ddp2 + not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash" # in interactive mode we don't manage tasks ): return False @@ -398,39 +428,70 @@ def _choose_strategy(self): self._strategy_flag = HorovodStrategy() if self._accelerator_flag == "ipu": - self._strategy_flag = IPUStrategy() + self._strategy_flag = "ipu" elif self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices)>1: - self._strategy_flag = TPUSpawnStrategy() + self._strategy_flag = "tpu_spawn" else: self._srategy_flag = SingleTPUStrategy() # [RFC] in existing logic SingleDevice strategy choice diverge between cpu and gpu, should we merge? 
- elif self._accelerator_flag == "gpu": - if self._num_nodes_flag > 1: - self._strategy_flag = DDPStrategy() - elif len(self._parallel_devices) == 1: - self._strategy_flag = DDPStrategy() - elif len(self._parallel_devices) > 1: - self._strategy_flag = DDPSpawnStrategy() - else: - self._strategy_flag = DDPStrategy() + # elif self._accelerator_flag == "gpu": + # if self._num_nodes_flag > 1: + # self._strategy_flag = "ddp" + # elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): + # self._strategy_flag = "ddp" + # elif len(self._parallel_devices) == 1: + # self._strategy_flag = "ddp" + # elif len(self._parallel_devices) > 1: + # self._strategy_flag = "ddp_spawn" + # else: + # self._strategy_flag = "ddp" else: if self._num_nodes_flag > 1: - self._strategy_flag = DDPStrategy() + self._strategy_flag = "ddp" + elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): + self._strategy_flag = "ddp" elif len(self._parallel_devices) <= 1: device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" self._strategy_flag = SingleDeviceStrategy(device = device) elif len(self._parallel_devices) > 1: - self._strategy_flag = DDPSpawnStrategy() + self._strategy_flag = "ddp_spawn" else: - self._strategy_flag = DDPStrategy() + self._strategy_flag = "ddp" + + def _strategy_fallbacks(self): + _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag + if _strategy_flag == "ddp_cpu": + if _TPU_AVAILABLE: + raise MisconfigurationException( + "`accelerator='ddp_cpu'` is not supported on TPU machines. " + "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" + ) + if self._device_flag ==1 and self._num_nodes_flag > 1: + _strategy_flag = "ddp" + else: + _strategy_flag = "ddp_spawn" + if self._accelerator_flag == "gpu": + rank_zero_warn( + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
+ ) + # if self._accelerator_flag == "cpu": + # self._parallel_devices = os.cpu_count() + + if "ddp_spawn" in _strategy_flag and (TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()): + _strategy_flag = "ddp" + + if _strategy_flag: + self._strategy_flag = _strategy_flag def _check_capatibility_and_init_precision(self): + print(self._precision_flag) self._precision_misconfig_check() if isinstance(self._precision_flag, PrecisionPlugin): self.precision_plugin = self._precision_flag + return if self._accelerator_flag =="ipu": self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) @@ -485,6 +546,7 @@ def _check_capatibility_and_init_precision(self): self.precision_plugin = PrecisionPlugin() def _precision_misconfig_check(self): + if self._accelerator_flag == "ipu": if self._precision_flag not in (16, 32): raise MisconfigurationException( @@ -516,6 +578,7 @@ def _precision_misconfig_check(self): def _init_strategy(self): + print(self._strategy_flag) if isinstance(self._strategy_flag, str): self.strategy = StrategyRegistry.get(self._strategy_flag) else: @@ -526,44 +589,92 @@ def _init_strategy(self): if self.checkpoint_io: self.strategy.checkpoint_io = self.checkpoint_io self.strategy.cluster_environment = self.cluster_environment + if hasattr(self.strategy, "parallel_devices"): + self.strategy.parallel_devices = self._parallel_devices ############################################################################## - # the following logic should be deprecated/removed + # the following logic should be deprecated/removed, and these information should be + # retrive from strategies and accelerators # Added here to keep backward compabilities - # @property - # def parallel_devices(self) -> List[Union[torch.device, int]]: - # return self._parallel_device - - # @property - # def replace_sampler_ddp(): - # return self.replace_sampler_ddp + @property + def parallel_devices(self) -> List[Union[torch.device, int]]: + return self._parallel_devices # def _distrib_type(): - - # def _device_type(): - - # def num_nodes(): - - # def num_processes(): - - # def root_gpu(): - + @property + def device_type(self): + if isinstance(self.accelerator, CPUAccelerator): + return "cpu" + if isinstance(self.accelerator, GPUAccelerator): + return "gpu" + if isinstance(self.accelerator, TPUAccelerator): + return "tpu" + if isinstance(self.accelerator, IPUAccelerator): + return "ipu" + + @property + def num_nodes(self): + return self._num_nodes + + @property + def num_processes(self): + return self.devices + + @property + def root_gpu(self) -> Optional[int]: + return ( + self.strategy.root_device.index + if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) + else None + ) + + @property def devices(self): return len(self._parallel_devices) - # def parallel_device_ids(): + @property + def tpu_cores(self) -> int: + return self.devices + + @property + def ipus(self) -> int: + return self.devices - # def gpus(): + @property + def num_gpus(self) -> int: + return self.devices - # def is_distributed(): + # def parallel_device_ids(): + @property + def gpus(self): + return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None + + + def is_distributed(self): + # Used for custom plugins. + # Custom plugins should implement is_distributed property. 
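+        # TPU runs skip this shortcut and also consult the strategy's own flag after the isinstance check below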
+ if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): + return self.strategy.is_distributed + distributed_strategy = (DDP2Strategy, DDPStrategy, DDPSpawnShardedStrategy, DDPShardedStrategy, DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, HorovodStrategy) + is_distributed = isinstance(self.strategy, distributed_strategy) + if isinstance(self.accelerator, TPUAccelerator): + is_distributed |= self.strategy.is_distributed + return is_distributed def has_ipu(self): - return self._accelerator_flag == "ipu" + return isinstance(self.accelerator, IPUAccelerator) def has_tpu(self): - return self._accelerator_flag == "tpu" + return isinstance(self.accelerator, TPUAccelerator) + + def use_dp(self): + return isinstance(self.strategy, DataParallelStrategy) + + @property + def _strategy_type(self) -> _StrategyType: + return self.strategy.distributed_backend diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bb3dde1e893a3..93fd6187be1ea 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1968,45 +1968,45 @@ def should_rank_save_checkpoint(self) -> bool: isinstance(strategy, pl.strategies.TPUSpawnStrategy) and strategy.local_rank == 0 or strategy.is_global_zero ) - # @property - # def _strategy_type(self) -> _StrategyType: - # return self._accelerator_connector._strategy_type + @property + def _strategy_type(self) -> _StrategyType: + return self.strategy.distributed_backend - # @property - # def _device_type(self) -> _AcceleratorType: - # return self._accelerator_connector._device_type + @property + def _device_type(self) -> _AcceleratorType: + return self._accelerator_connector.device_type - # @property - # def num_nodes(self) -> int: - # return self._accelerator_connector.num_nodes + @property + def num_nodes(self) -> int: + return self._accelerator_connector.num_nodes - # @property - # def num_processes(self) -> int: - # return self._accelerator_connector.num_processes + @property + def num_processes(self) -> int: + return self._accelerator_connector.num_processes - # @property - # def root_gpu(self) -> Optional[int]: - # return self._accelerator_connector.root_gpu + @property + def root_gpu(self) -> Optional[int]: + return self._accelerator_connector.root_gpu - # @property - # def tpu_cores(self) -> int: - # return self._accelerator_connector.tpu_cores + @property + def tpu_cores(self) -> int: + return self._accelerator_connector.tpu_cores - # @property - # def ipus(self) -> int: - # return self._accelerator_connector.num_ipus + @property + def ipus(self) -> int: + return self._accelerator_connector.num_ipus - # @property - # def num_gpus(self) -> int: - # return self._accelerator_connector.num_gpus + @property + def num_gpus(self) -> int: + return self._accelerator_connector.num_gpus @property def devices(self) -> Optional[Union[List[int], str, int]]: return self._accelerator_connector.devices - # @property - # def data_parallel_device_ids(self) -> Optional[List[int]]: - # return self._accelerator_connector.parallel_device_ids + @property + def data_parallel_device_ids(self) -> Optional[List[int]]: + return self._accelerator_connector.parallel_devices @property def lightning_module(self) -> "pl.LightningModule": From 7307969152cbf83f2fa58f816e1ee6d9d23e7563 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 24 Jan 2022 22:20:25 -0800 Subject: [PATCH 03/69] update --- .../connectors/accelerator_connector.py | 1327 +++++++---------- 
.../connectors/accelerator_connector_new.py | 680 --------- .../test_accelerator_connector.py | 14 +- 3 files changed, 520 insertions(+), 1501 deletions(-) delete mode 100644 pytorch_lightning/trainer/connectors/accelerator_connector_new.py diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index fd65975618f02..72c9a78f06602 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -66,10 +66,11 @@ ) from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, AMPType, device_parser from pytorch_lightning.utilities.enums import PrecisionType -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.exceptions import MisconfigurationException, DeviceNotAvailibleException, ImpactableConfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, _IPU_AVAILABLE, + _GPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, ) @@ -84,421 +85,326 @@ class AcceleratorConnector: def __init__( self, - num_processes, devices, - tpu_cores, - ipus, - accelerator, - strategy: Optional[Union[str, Strategy]], - gpus, - gpu_ids, num_nodes, + accelerator, # reduce typing + strategy: Optional[Union[str, Strategy]], + plugins, + precision, + amp_type, + amp_level, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic: bool, - precision, - amp_type, - amp_level, - plugins, + num_processes, # deprecated + tpu_cores, # deprecated + ipus, # deprecated + gpus, # deprecated + gpu_ids, ): - # initialization - self._device_type = _AcceleratorType.CPU - self._strategy_type = None - self._accelerator_type = None - - self._strategy_flag = strategy.lower() if isinstance(strategy, str) else strategy - # TODO: Rename this to something else once all the distributed flags are moved to strategy - self.distributed_backend = accelerator - - self._init_deterministic(deterministic) - - self.num_processes = num_processes - self.devices = devices - # `gpus` is the input passed to the Trainer, whereas `gpu_ids` is a list of parsed gpu ids. - self.gpus = gpus - self.parallel_device_ids = gpu_ids - self.tpu_cores = tpu_cores - self.ipus = ipus - self.num_nodes = num_nodes - self.sync_batchnorm = sync_batchnorm - self.benchmark = benchmark - self.replace_sampler_ddp = replace_sampler_ddp - if not PrecisionType.supported_type(precision): - raise MisconfigurationException( - f"Precision {repr(precision)} is invalid. Allowed precision values: {PrecisionType.supported_types()}" - ) - self.precision = precision - self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None - self.amp_level = amp_level - - self._precision_plugin: Optional[PrecisionPlugin] = None - self._strategy: Optional[Strategy] = None - self._cluster_environment: Optional[ClusterEnvironment] = None - self._checkpoint_io: Optional[CheckpointIO] = None - - plugins = plugins if plugins is not None else [] + """ + A. accelerator flag could be: + 1. strategy class (deprecated in 1.5 will be removed in 1.7) + 2. strategy str (deprecated in 1.5 will be removed in 1.7) + 3. accelerator class + 4. accelerator str + 5. accelerator auto + + B. strategy flag could be : + 1. strategy class + 2. strategy str registered with strategyRegister + 3. strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) + + C. 
plugins flag could be: + 1. List of str, which could contains: + i. strategy str + ii. precision str (Not supported in the old accelerator_connector version) + iii. checkpoint_io str (Not supported in the old accelerator_connector version) + iv. cluster_environment str (Not supported in the old accelerator_connector version) + 2. List of class, which could contains: + i. strategy class (deprecated in 1.5 will be removed in 1.7) + ii. precision class (should be removed, and precision flag should allow user pass classes) + iii. checkpoint_io class + iv. cluster_environment class + + + priorities which to take when: + A. Class > str + B. Strategy > Accelerator/precision/plugins + C. When multiple flag set to the same thing? (ignore? not handled for now) - if isinstance(plugins, str): - plugins = [plugins] + """ - if not isinstance(plugins, Sequence): - plugins = [plugins] + # --Parsing_flags------------------------------------------------------ + # Get registered strategies, existing accelerators and precision plugins + self._existing_strategies_str = StrategyRegistry.available_strategies() + print(self._existing_strategies_str) + self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] + self._supported_precision = PrecisionType.supported_types() - self.plugins = plugins + # raise misconfig exceptions if their is conflict between flags + # set the valid flag to self._x_flag after validation + # for example: if accelerator is strategy class, set self._strategy_flag = accelerator + # for devices: assign gpus ipus and etcs to accelerator_flag and devices_flag + self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) + self._device_config_check_and_set_final_flags(devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores) - self._handle_accelerator_and_strategy() - self._validate_accelerator_and_devices() + # --Accelerator------------------------------------------------------------- + # handle `auto` and `None` + if self._accelerator_flag == 'auto' or self._accelerator_flag is None: + self._choose_accelerator() + # else: + # # [RFC] move to XAccelerator class init? + # self._check_device_availibility() + self._set_parallel_devices_and_init_accelerator() - self._warn_if_devices_flag_ignored() - self.select_accelerator_type() + # --Cluster_environment----------------------------------------------------- + self._choose_and_init_cluster_environment() - if self._strategy_flag is not None: - self._set_strategy() - else: - self.set_distributed_mode() - self.handle_given_plugins() - self._set_strategy_type_if_strategy_passed() + # --Strategy Part 1 : choose strategy --------------------------------------- + if self._strategy_flag is None: + self._choose_strategy() + # Reset strategy even user has specificed one + self._strategy_fallbacks() + self._init_strategy() - self._cluster_environment = self.select_cluster_environment() + # --Precision---------------------------------------------------------------- + self._check_capatibility_and_init_precision() - self.update_device_type_if_ipu_plugin() - self.update_device_type_if_strategy_passed() - self._validate_accelerator_type() - self._set_devices_if_none() + # --Strategy Part 2 : init Strategy and set Strategy properties ------------- + self._lazy_init_strategy() - self.strategy = self.final_strategy() - self.accelerator = self.strategy.accelerator - self._check_plugin_compatibility() - # benchmarking - # TODO: should this be moved to GPU accelerator? 
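The precedence rules stated in the docstring above (class beats string, an explicit `strategy` wins over strategy values passed through `accelerator` or `plugins`, conflicting duplicates raise) can be summarized in a few lines. The sketch below is illustrative only: it uses plain strings where the connector checks `Strategy` instances and the `StrategyRegistry`.

    KNOWN_STRATEGIES = {"ddp", "ddp_spawn", "ddp2", "deepspeed"}   # stand-in for StrategyRegistry
    KNOWN_ACCELERATORS = {"cpu", "gpu", "tpu", "ipu", "auto"}

    def resolve_strategy_flag(strategy=None, accelerator=None, plugins=()):
        """Return the single strategy value, raising if it was set in more than one way."""
        candidates = []
        if strategy is not None:
            candidates.append(strategy)
        if accelerator in KNOWN_STRATEGIES:          # deprecated path: accelerator carries a strategy
            candidates.append(accelerator)
        candidates += [p for p in plugins if p in KNOWN_STRATEGIES]   # deprecated path via plugins
        if len(set(candidates)) > 1:
            raise ValueError(f"strategy set more than once: {candidates}")
        return candidates[0] if candidates else None

    assert resolve_strategy_flag(strategy="ddp") == "ddp"
    assert resolve_strategy_flag(accelerator="ddp_spawn") == "ddp_spawn"   # deprecated but accepted
    try:
        resolve_strategy_flag(strategy="ddp", plugins=["deepspeed"])
    except ValueError:
        pass   # duplicate strategy settings are rejected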
- torch.backends.cudnn.benchmark = self.benchmark + # set properties not used in accelerator_connector. TODO move out of this file + # self.gpus = gpus or devices self.replace_sampler_ddp = replace_sampler_ddp - def _init_deterministic(self, deterministic: bool) -> None: - self.deterministic = deterministic - if _TORCH_GREATER_EQUAL_1_8: - torch.use_deterministic_algorithms(deterministic) - else: - torch.set_deterministic(deterministic) - if deterministic: - # fixing non-deterministic part of horovod - # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 - os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) - # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - - def select_accelerator_type(self) -> None: - if self.distributed_backend == "auto": - if self.has_tpu: - self._accelerator_type = _AcceleratorType.TPU - elif self.has_ipu: - self._accelerator_type = _AcceleratorType.IPU - elif self.has_gpu: - self._accelerator_type = _AcceleratorType.GPU - else: - self._set_devices_to_cpu_num_processes() - self._accelerator_type = _AcceleratorType.CPU - elif self.distributed_backend == _AcceleratorType.TPU: - if not self.has_tpu: - msg = "TPUs are not available" if not _TPU_AVAILABLE else "you didn't pass `tpu_cores` to `Trainer`" - raise MisconfigurationException(f"You passed `accelerator='tpu'`, but {msg}.") - self._accelerator_type = _AcceleratorType.TPU - elif self.distributed_backend == _AcceleratorType.IPU: - if not self.has_ipu: - msg = "IPUs are not available" if not _IPU_AVAILABLE else "you didn't pass `ipus` to `Trainer`" - raise MisconfigurationException(f"You passed `accelerator='ipu'`, but {msg}.") - self._accelerator_type = _AcceleratorType.IPU - elif self.distributed_backend == _AcceleratorType.GPU: - if not self.has_gpu: - msg = "you didn't pass `gpus` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" - raise MisconfigurationException(f"You passed `accelerator='gpu'`, but {msg}.") - self._accelerator_type = _AcceleratorType.GPU - elif self.distributed_backend == _AcceleratorType.CPU: - self._set_devices_to_cpu_num_processes() - self._accelerator_type = _AcceleratorType.CPU - - if self.distributed_backend in self.accelerator_types: - self.distributed_backend = None - - def _validate_accelerator_and_devices(self) -> None: - if self.distributed_backend not in self.accelerator_types and self.devices is not None: - raise MisconfigurationException( - f"You passed `devices={self.devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping," - f" got `accelerator={self.distributed_backend!r}`." - ) - - def _validate_accelerator_type(self) -> None: - if self._accelerator_type and self._accelerator_type != self._device_type: - # internal error: should not happen. - raise ValueError( - f"Mismatch between the requested accelerator type ({self._accelerator_type})" - f" and assigned device type ({self._device_type})." 
- ) - self._accelerator_type = self._device_type - - def _warn_if_devices_flag_ignored(self) -> None: - if self.devices is None: - return - devices_warning = f"The flag `devices={self.devices}` will be ignored, as you have set" - if self.distributed_backend in ("auto", _AcceleratorType.TPU): - if self.tpu_cores is not None: - rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") - elif self.distributed_backend in ("auto", _AcceleratorType.IPU): - if self.ipus is not None: - rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") - elif self.distributed_backend in ("auto", _AcceleratorType.GPU): - if self.gpus is not None: - rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") - elif self.distributed_backend in ("auto", _AcceleratorType.CPU): - if self.num_processes != 1: - rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") - - def _set_devices_if_none(self) -> None: - if self.devices is not None: - return - if self._accelerator_type == _AcceleratorType.TPU: - self.devices = self.tpu_cores - elif self._accelerator_type == _AcceleratorType.IPU: - self.devices = self.ipus - elif self._accelerator_type == _AcceleratorType.GPU: - self.devices = self.gpus - elif self._accelerator_type == _AcceleratorType.CPU: - self.devices = self.num_processes - - def _handle_accelerator_and_strategy(self) -> None: - deprecated_types = [t for t in _StrategyType if t not in (_StrategyType.TPU_SPAWN, _StrategyType.DDP_CPU)] - if self.distributed_backend is not None and self.distributed_backend in deprecated_types: - rank_zero_deprecation( - f"Passing `Trainer(accelerator={self.distributed_backend!r})` has been deprecated" - f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={self.distributed_backend!r})` instead." - ) - if self._strategy_flag is not None: + def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): + """ + This method checks: + 1. strategy flag: strategy, accelerator and plugin can all set strategies + 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; + If accelerator is strategy related, logic handled in 1 above + 3. precision could be set by precision and plugins flag + 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment + """ + self._strategy_flag, self._accelerator_flag, self._precision_flag, self._cluster_environment, self.checkpoint_io, self._amp_level_flag, self._amp_type_flag = None, None, None, None, None, amp_type, amp_level + if strategy: + self._strategy_flag = strategy + if strategy == "ddp_cpu": raise MisconfigurationException( - f"You have passed `Trainer(strategy={self._strategy_flag!r})` but have" - f" also passed `Trainer(accelerator={self.distributed_backend!r})`." - f" HINT: Use just `Trainer(strategy={self._strategy_flag!r})` instead." + "`Trainer(strategy='ddp_cpu')` is not a valid strategy," + " you can use `Trainer(strategy='ddp'|'ddp_spawn', accelerator='cpu')` instead." ) - if self._strategy_flag == _StrategyType.TPU_SPAWN: - raise MisconfigurationException( + if strategy == "tpu_spawn": + raise MisconfigurationException( "`Trainer(strategy='tpu_spawn')` is not a valid strategy," " you can use `Trainer(strategy='ddp_spawn', accelerator='tpu')` instead." 
) - if self._strategy_flag == _StrategyType.DDP_CPU: - raise MisconfigurationException( - "`Trainer(strategy='ddp_cpu')` is not a valid strategy," - " you can use `Trainer(strategy='ddp'|'ddp_spawn', accelerator='cpu')` instead." + # handle duplications and conflict + if isinstance(accelerator, Strategy) and strategy != accelerator: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in accelerator") + if isinstance(accelerator, str) and accelerator in self._existing_strategies_str and strategy != accelerator: + raise MisconfigurationException("strategy str already set through strategy flag, duplicated in accelerator") + if plugins: + for plugin in plugins: + if isinstance(plugin, Strategy) and strategy != plugin: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") + if isinstance(plugin, str) and plugin in self._existing_strategies_str: + raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") + + + if accelerator in self._existing_accelerator_type or accelerator=="auto" or isinstance(accelerator, Accelerator): + self._accelerator_flag = accelerator + elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): + rank_zero_deprecation( + f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" + f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." ) - - def _set_strategy(self) -> None: - if isinstance(self._strategy_flag, str) and self._strategy_flag in StrategyRegistry: - self._strategy = StrategyRegistry.get(self._strategy_flag) - if isinstance(self._strategy_flag, str): - self.set_distributed_mode(self._strategy_flag) - elif isinstance(self._strategy_flag, Strategy): - self._strategy = self._strategy_flag - - def handle_given_plugins(self) -> None: - - for plug in self.plugins: - if self._strategy_flag is not None and self._is_plugin_training_type(plug): - raise MisconfigurationException( - f"You have passed `Trainer(strategy={self._strategy_flag!r})`" - f" and you can only specify one training type plugin, but you have passed {plug} as a plugin." - ) - if self._is_plugin_training_type(plug): - rank_zero_deprecation( - f"Passing {plug} `strategy` to the `plugins` flag in Trainer has been deprecated" - f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={plug})` instead." + self._strategy_flag = accelerator + elif accelerator == "ddp_cpu": + rank_zero_warn( + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
) - - strategy = self._strategy or None - checkpoint = None - precision = None - cluster_environment = None - - for plug in self.plugins: - if isinstance(plug, str) and plug in StrategyRegistry: - if strategy is None: - strategy = StrategyRegistry.get(plug) + self._strategy_flag = accelerator + + if precision: + self._precision_flag = precision + # handle duplications and conflict + if plugins: + for plugin in plugins: + if isinstance(plugin, PrecisionPlugin): + raise MisconfigurationException("precision set in both precision flag and plugin flag") + + if plugins: + plugins = [plugins] if not isinstance(plugins, list) else plugins + for plugin in plugins: + print(plugin) + if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: + self._strategy_flag = plugin + rank_zero_deprecation( + f"Passing {plugin} `strategy` to the `plugins` flag in Trainer has been deprecated" + f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={plugin})` instead." + ) + + elif isinstance(plugin, PrecisionPlugin) or isinstance(plugin, str) and plugin in self._supported_precision: + self._precision_flag = plugin + elif isinstance(plugin, CheckpointIO): + self.checkpoint_io = plugin + elif isinstance(plugin, ClusterEnvironment): + self._cluster_environment = plugin else: - raise MisconfigurationException( - "You can only specify one precision and one training type plugin." - " Found more than 1 training type plugin:" - f' {StrategyRegistry[plug]["strategy"]} registered to {plug}' - ) - if isinstance(plug, str): - # Reset the distributed type as the user has overridden training type - # via the plugins argument - self._strategy_type = None - self.set_distributed_mode(plug) + raise MisconfigurationException(f"Does not recognize flag {plugin}") - elif isinstance(plug, Strategy): - if strategy is None: - strategy = plug + + # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up + if self._strategy_flag and isinstance(self._strategy_flag, Strategy): + if self._strategy_flag.accelerator: + if self._accelerator_flag: + raise MisconfigurationException("accelerator set through both strategy class and accelerator flag, choose one") else: - raise MisconfigurationException( - "You can only specify one training type plugin." - f" Available: {type(strategy).__name__}, given: {type(plug).__name__}" - ) - elif isinstance(plug, PrecisionPlugin): - if precision is None: - precision = plug - else: - raise MisconfigurationException( - "You can only specify one precision plugin." - f" Available: {type(precision).__name__}, given: {type(plug).__name__}" - ) - elif isinstance(plug, CheckpointIO): - if checkpoint is None: - checkpoint = plug + self._accelerator_flag = self._strategy_flag.accelerator + if self._strategy_flag.precision_plugin: + # precision has default value 32, we can not tell whether user set it or not [RFC] remove default from trainer? + # if self._precision_flag: + # raise MisconfigurationException("precision set through both strategy class and flags, choose one place to set") + # else: + self._precision_flag = self._strategy_flag.precision_plugin + if self._strategy_flag.checkpoint_io: + if self.checkpoint_io: + raise MisconfigurationException("checkpoint_io set through both strategy class and plugins, choose one") else: - raise MisconfigurationException( - "You can only specify one checkpoint plugin." 
- f" Available: {type(checkpoint).__name__}, given: {type(plug).__name__}" - ) - elif isinstance(plug, ClusterEnvironment): - if cluster_environment is None: - cluster_environment = plug + self.checkpoint_io = self._strategy_flag.checkpoint_io + if getattr(self._strategy_flag, "cluster_environment", None): + if self._cluster_environment: + raise MisconfigurationException("cluster_environment set through both strategy class and plugins, choose one") else: - raise MisconfigurationException( - "You can only specify one cluster environment. Found more than 1 cluster environment plugin" - ) - else: + self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") + + + amp_type = amp_type.lower() if isinstance(amp_type, str) else None + self._amp_type_flag = AMPType.from_str(amp_type) if amp_type is not None else None + + # TODO still working on these flags + # if amp_level is not None and self._amp_type_flag != AMPType.APEX: + # raise MisconfigurationException( + # f"You have asked for `amp_level={self._amp_level_flag!r}` but it's only supported with `amp_backend='apex'`." + # ) + self._amp_level_flag = amp_level + + + def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): + if num_nodes == "auto": + self._num_nodes_flag = 1 + else : + self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + + self._device_flag = devices + ##### to be deleted v1.7 + deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores + if deprecated_devices_specific_flag: + self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores) + ##### deleted end + if devices == "auto": + if self._accelerator_flag is None: raise MisconfigurationException( - f"Found invalid type for plugin {plug}. Expected a precision or training type plugin." 
+ f"You passed `devices={devices}` but haven't specified" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" ) - self._strategy = strategy - self._precision_plugin = precision - self._checkpoint_io = checkpoint - self._cluster_environment = cluster_environment - - @property - def accelerator_types(self) -> List[str]: - return ["auto"] + list(_AcceleratorType) - - @property - def precision_plugin(self) -> PrecisionPlugin: - if self._precision_plugin is None: - self._precision_plugin = self.select_precision_plugin() - return self._precision_plugin - - def final_strategy(self) -> Strategy: - if self._strategy is None: - self._strategy = self.select_strategy() - self._strategy = self.resolve_strategy(self._strategy) - # attach checkpoint plugin to the training type plugin - if self._checkpoint_io is not None: - self._strategy.checkpoint_io = self._checkpoint_io - if ( - isinstance(self._strategy_flag, Strategy) and self._strategy_flag._precision_plugin is None - ) or not isinstance(self._strategy_flag, Strategy): - precision_plugin = self.precision_plugin - if precision_plugin is not None: - self._strategy.precision_plugin = precision_plugin - if (isinstance(self._strategy_flag, Strategy) and self._strategy_flag.accelerator is None) or not isinstance( - self._strategy_flag, Strategy - ): - self._strategy.accelerator = self.select_accelerator() - return self._strategy - - @property - def cluster_environment(self) -> ClusterEnvironment: - if self._cluster_environment is None: - self._cluster_environment = self.select_cluster_environment() - return self._cluster_environment - - @property - def has_cpu(self) -> bool: - return True - - @property - def use_cpu(self) -> bool: - return self._accelerator_type == _AcceleratorType.CPU - - @property - def has_gpu(self) -> bool: - # Here, we are not checking for GPU availability, but instead if User has passed - # `gpus` to Trainer for training. - gpus = self.parallel_device_ids - if gpus is not None and len(gpus) > 0: - return True - return self._map_devices_to_accelerator(_AcceleratorType.GPU) - - @property - def use_gpu(self) -> bool: - return self._accelerator_type == _AcceleratorType.GPU and self.has_gpu - - @property - def has_tpu(self) -> bool: - # Here, we are not checking for TPU availability, but instead if User has passed - # `tpu_cores` to Trainer for training. - if self.tpu_cores is not None: - return True - return self._map_devices_to_accelerator(_AcceleratorType.TPU) - @property - def use_tpu(self) -> bool: - return self._accelerator_type == _AcceleratorType.TPU and self.has_tpu - - @property - def tpu_id(self) -> Optional[int]: - if self.use_tpu and isinstance(self.tpu_cores, list): - return self.tpu_cores[0] - return None - - @property - def has_ipu(self) -> bool: - # Here, we are not checking for IPU availability, but instead if User has passed - # `ipus` to Trainer for training. 
- if self.ipus is not None or isinstance(self._strategy, IPUStrategy): - return True - return self._map_devices_to_accelerator(_AcceleratorType.IPU) - - @property - def use_ipu(self) -> bool: - return self._accelerator_type == _AcceleratorType.IPU and self.has_ipu - - def _set_devices_to_cpu_num_processes(self) -> None: - if self.num_processes == 1: - self._map_devices_to_accelerator(_AcceleratorType.CPU) - - def _map_devices_to_accelerator(self, accelerator: str) -> bool: - if self.devices is None: - return False - if accelerator == _AcceleratorType.TPU and _TPU_AVAILABLE: - if self.devices == "auto": - self.devices = TPUAccelerator.auto_device_count() - self.tpu_cores = device_parser.parse_tpu_cores(self.devices) - return True - if accelerator == _AcceleratorType.IPU and _IPU_AVAILABLE: - if self.devices == "auto": - self.devices = IPUAccelerator.auto_device_count() - self.ipus = self.devices - return True - if accelerator == _AcceleratorType.GPU and torch.cuda.is_available(): - if self.devices == "auto": - self.devices = GPUAccelerator.auto_device_count() - self.gpus = self.devices - self.parallel_device_ids = device_parser.parse_gpu_ids(self.devices) - return True - if accelerator == _AcceleratorType.CPU: - if self.devices == "auto": - self.devices = CPUAccelerator.auto_device_count() - if not isinstance(self.devices, int): + def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores): + ##### to be deleted v1.7vbg + # set devices base on num_processes, gpus, ipus, tpu_cores + if devices: + rank_zero_warn(f"The flag `devices={devices}` will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used") + if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count(True) > 1: + rank_zero_warn(f"more than one device specifc flag has been set") + self._device_flag = deprecated_devices_specific_flag + + if not self._accelerator_flag: + # set accelerator type base on num_processes, gpus, ipus, tpu_cores + if num_processes: + self._accelerator_flag = "cpu" + if gpus: + self._accelerator_flag = "gpu" + if tpu_cores: + self._accelerator_flag = "tpu" + if ipus: + self._accelerator_flag = "ipu" + #### delete end + + def _choose_accelerator(self): + if self._accelerator_flag == "auto": + if _TPU_AVAILABLE: + self._accelerator_flag = "tpu" + elif _IPU_AVAILABLE: + self._accelerator_flag = "ipu" + elif _GPU_AVAILABLE: + self._accelerator_flag = "gpu" + else: + self._accelerator_flag = "cpu" + if self._device_flag == "auto": + self._device_flag = 1 + # [RFC] this is current logic, if accelerator not set, default cpu? 
+ else: + self._accelerator_flag = "cpu" + + + def _check_device_availibility(self): + for accelerator_flag, available in zip(self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True]): + if self._accelerator_flag == accelerator_flag: + if not available: + raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") + + # TODO in progress for setting up devices + def _set_parallel_devices_and_init_accelerator(self): + self._parallel_devices = [] + + if isinstance(self._accelerator_flag, Accelerator): + self.accelerator = self._accelerator_flag + elif self._accelerator_flag == "tpu": + self.accelerator = TPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = TPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = list(range(self._device_flag)) + + elif self._accelerator_flag == "ipu": + self.accelerator = IPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = IPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int): + self._parallel_devices = list(range(self._device_flag)) + + elif self._accelerator_flag == "gpu": + self.accelerator = GPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = GPUAccelerator.auto_device_count() + if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): + self._device_flag = int(self._device_flag) + self._parallel_devices = [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + elif isinstance(self._device_flag, list): + self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] + + + elif self._accelerator_flag == "cpu": + self.accelerator = CPUAccelerator() + if self._device_flag == "auto" or not self._device_flag: + self._device_flag = CPUAccelerator.auto_device_count() + if not isinstance(self._device_flag, int): raise MisconfigurationException( "The flag `devices` must be an int with `accelerator='cpu'`," - f" got `devices={self.devices}` instead." + f" got `devices={self._device_flag}` instead." 
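A condensed sketch of the two steps shown above, `_choose_accelerator` followed by the per-accelerator device setup, with availability passed in as booleans instead of the `_TPU_AVAILABLE`/`_IPU_AVAILABLE`/`_GPU_AVAILABLE` flags and GPU ids taken from a simple `range` rather than `device_parser.parse_gpu_ids`:

    import torch

    def choose_accelerator(flag, tpu=False, ipu=False, gpu=False):
        # "auto" resolves in a fixed preference order; an unset flag currently defaults to CPU.
        if flag not in ("auto", None):
            return flag
        if flag is None:
            return "cpu"
        if tpu:
            return "tpu"
        if ipu:
            return "ipu"
        if gpu:
            return "gpu"
        return "cpu"

    def build_parallel_devices(accelerator, device_flag):
        # Map the accelerator string plus a device count (or explicit GPU id list) to device objects.
        if accelerator in ("tpu", "ipu"):
            return list(range(device_flag))
        if accelerator == "gpu":
            ids = device_flag if isinstance(device_flag, list) else range(int(device_flag))
            return [torch.device("cuda", i) for i in ids]
        if accelerator == "cpu":
            if not isinstance(device_flag, int):
                raise ValueError("`devices` must be an int with `accelerator='cpu'`")
            return [torch.device("cpu")] * device_flag
        raise ValueError(f"unknown accelerator {accelerator!r}")

    assert choose_accelerator("auto", gpu=True) == "gpu"
    assert choose_accelerator(None, gpu=True) == "cpu"   # the [RFC] default noted above
    assert build_parallel_devices("gpu", 2) == [torch.device("cuda", 0), torch.device("cuda", 1)]
    assert len(build_parallel_devices("cpu", 4)) == 4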
) +<<<<<<< HEAD self.num_processes = self.devices return True return False @@ -519,19 +425,26 @@ def use_ddp(self) -> bool: _StrategyType.DEEPSPEED, _StrategyType.TPU_SPAWN, ) +======= + self._parallel_devices = [torch.device("cpu")] * self._device_flag +>>>>>>> dccae1d6f (update) - @property - def use_ddp2(self) -> bool: - return self._strategy_type == _StrategyType.DDP2 + self._gpus = self._device_flag - @property - def use_horovod(self) -> bool: - return self._strategy_type == _StrategyType.HOROVOD - @property - def use_deepspeed(self) -> bool: - return self._strategy_type == _StrategyType.DEEPSPEED + def _choose_and_init_cluster_environment(self): + self.cluster_environment = LightningEnvironment() + if isinstance(self._cluster_environment, ClusterEnvironment): + self.cluster_environment = self._cluster_environment + elif self._is_slurm_managing_tasks(): + rank_zero_info("Multiprocessing is handled by SLURM.") + self.cluster_environment = SLURMEnvironment() + else: + for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + if env_type.detect(): + self.cluster_environment = env_type() +<<<<<<< HEAD @property def use_bagua(self) -> bool: return self._strategy_type == _StrategyType.BAGUA @@ -539,493 +452,277 @@ def use_bagua(self) -> bool: @property def _is_sharded_training_type(self) -> bool: return isinstance(self._strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)) +======= +>>>>>>> dccae1d6f (update) - @property - def _is_fully_sharded_training_type(self) -> bool: - return isinstance(self._strategy, DDPFullyShardedStrategy) - - @property - def is_distributed(self) -> bool: - # Used for custom plugins. - # Custom plugins should implement is_distributed property. - if hasattr(self.strategy, "is_distributed") and not self.use_tpu: - return self.strategy.is_distributed - is_distributed = self.use_ddp or self.use_ddp2 or self.use_horovod - if self.use_tpu: - is_distributed |= self.strategy.is_distributed - return is_distributed + def _is_slurm_managing_tasks(self): + """ + used by choosing cluster enviroment + """ + if ( + #(not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") + # the above logic moved to _select_strategy(), only check _is_slurm_managing_tasks() + # when strategy flag is ddp or ddp2 + not SLURMEnvironment.detect() + or SLURMEnvironment.job_name() == "bash" + ): + return False - @property - def num_gpus(self) -> int: - gpus = self.parallel_device_ids - if gpus is None: - return 0 - return len(gpus) + total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag + num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) + return num_slurm_tasks == total_requested_devices - @property - def num_ipus(self) -> int: - if isinstance(self.ipus, int): - return self.ipus - if isinstance(self._strategy, IPUStrategy): - return self._strategy.replication_factor - return 0 + def _choose_strategy(self): + if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): + self._strategy_flag = HorovodStrategy() - @property - def parallel_devices(self) -> List[Union[torch.device, int]]: - if self.use_gpu: - devices = [torch.device("cuda", i) for i in self.parallel_device_ids] - elif self.use_tpu: - # explicitly don't make a tpu device here! 
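The SLURM branch of the cluster-environment selection above reduces to one rule: SLURM manages the processes only when the scheduler created exactly one task per requested device. A self-contained approximation follows; the real code goes through `SLURMEnvironment.detect()` and `SLURMEnvironment.job_name()` rather than reading the environment directly.

    import os

    def slurm_manages_tasks(num_parallel_devices, num_nodes):
        job_name = os.environ.get("SLURM_JOB_NAME")
        # "bash" indicates an interactive srun session, where Lightning launches processes itself.
        if "SLURM_NTASKS" not in os.environ or job_name == "bash":
            return False
        return int(os.environ["SLURM_NTASKS"]) == num_parallel_devices * num_nodes

    os.environ.update({"SLURM_NTASKS": "8", "SLURM_JOB_NAME": "train"})
    assert slurm_manages_tasks(4, 2) is True
    assert slurm_manages_tasks(4, 1) is False   # task count does not match devices * nodes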
- # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169 - if isinstance(self.tpu_cores, int): - devices = list(range(self.tpu_cores)) - elif self.use_ipu: - devices = list(range(self.num_ipus)) + if self._accelerator_flag == "ipu": + self._strategy_flag = "ipu" + elif self._accelerator_flag == "tpu": + if self._parallel_devices and len(self._parallel_devices)>1: + self._strategy_flag = "tpu_spawn" + else: + self._srategy_flag = SingleTPUStrategy() else: - devices = [torch.device("cpu")] * self.num_processes - return devices - - @property - def root_gpu(self) -> Optional[int]: - return ( - self.strategy.root_device.index - if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) - else None - ) - - @staticmethod - def _is_plugin_training_type(plugin: Union[str, Strategy]) -> bool: - if isinstance(plugin, str) and (plugin in StrategyRegistry or plugin in list(_StrategyType)): - return True - return isinstance(plugin, Strategy) - - @property - def is_training_type_in_plugins(self) -> bool: - return any( - (isinstance(plug, str) and plug in StrategyRegistry) or isinstance(plug, Strategy) for plug in self.plugins - ) - - def select_precision_plugin(self) -> PrecisionPlugin: - # set precision type - self.amp_type = AMPType.from_str(self.amp_type) + if self._num_nodes_flag > 1: + self._strategy_flag = "ddp" + elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): + self._strategy_flag = "ddp" + elif len(self._parallel_devices) <= 1: + device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" + self._strategy_flag = SingleDeviceStrategy(device = device) + elif len(self._parallel_devices) > 1: + self._strategy_flag = "ddp_spawn" + else: + self._strategy_flag = "ddp" - # validation for all plugins - if self.amp_level is not None and self.amp_type != AMPType.APEX: - raise MisconfigurationException( - f"You have asked for `amp_level={self.amp_level!r}` but it's only supported with `amp_backend='apex'`." - ) - if self.use_ipu: - if self.precision not in (16, 32): + def _strategy_fallbacks(self): + _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag + if _strategy_flag == "ddp_cpu": + if _TPU_AVAILABLE: raise MisconfigurationException( - f"`Trainer(accelerator='ipu', precision={self.precision!r})` is not supported." + "`accelerator='ddp_cpu'` is not supported on TPU machines. " + "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) - return IPUPrecisionPlugin(self.precision) - if self.use_tpu: - if self.precision == 32: - return TPUPrecisionPlugin() - elif self.precision == 64: - raise MisconfigurationException( - "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" - " requesting this feature." + if self._device_flag ==1 and self._num_nodes_flag > 1: + _strategy_flag = "ddp" + else: + _strategy_flag = "ddp_spawn" + if self._accelerator_flag == "gpu": + rank_zero_warn( + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
) - elif self.precision in (16, "bf16"): - if self.precision == 16: + if "ddp_spawn" in _strategy_flag and (TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()): + _strategy_flag = "ddp" + if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": + rank_zero_warn( + f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`." + ) + _strategy_flag = "ddp" + if _strategy_flag: + self._strategy_flag = _strategy_flag + + def _init_strategy(self): + print(self._strategy_flag) + if isinstance(self._strategy_flag, str): + self.strategy = StrategyRegistry.get(self._strategy_flag) + else: + self.strategy = self._strategy_flag + + def _check_capatibility_and_init_precision(self): + print(self._precision_flag) + self._precision_misconfig_check() + if isinstance(self._precision_flag, PrecisionPlugin): + self.precision_plugin = self._precision_flag + return + + if self._accelerator_flag =="ipu": + self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) + if self._accelerator_flag == "tpu": + if self._precision_flag == 32: + self.precision_plugin = TPUPrecisionPlugin() + elif self._precision_flag in (16, "bf16"): + if self._precision_flag == 16: # this is not deprecated to ease transition between accelerator environments rank_zero_warn( - f"You passed `Trainer(accelerator='tpu', precision=16)` but {self.amp_type.value} AMP" + f"You passed `Trainer(accelerator='tpu', precision=16)` but {self._amp_type_flag.value} AMP" f" is not supported with TPUs. Using `precision='bf16'` instead." ) - return TPUBf16PrecisionPlugin() + self.precision_plugin = TPUBf16PrecisionPlugin() + if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): + self.precision_plugin = DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) - if self._strategy_type == _StrategyType.DEEPSPEED or isinstance(self._strategy, DeepSpeedStrategy): - return DeepSpeedPrecisionPlugin(self.precision, self.amp_type, self.amp_level) - - if self.precision == 32: - return PrecisionPlugin() - if self.precision == 64: - return DoublePrecisionPlugin() + if self._precision_flag == 32: + self.precision_plugin = PrecisionPlugin() + if self._precision_flag == 64: + self.precision_plugin = DoublePrecisionPlugin() # maybe convert the precision value - if self.precision == 16 and self.use_cpu: - if self.amp_type == AMPType.APEX: - # apex was explicitly passed, not a good idea to silently switch to native AMP - raise MisconfigurationException( - "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" - " but apex AMP not supported on CPU." - ) + if self._precision_flag == 16 and self._accelerator_flag == "cpu": # this automatic switch is to ease transition between accelerator environments rank_zero_warn( "You passed `Trainer(accelerator='cpu', precision=16)` but native AMP is not supported on CPU." " Using `precision='bf16'` instead." ) - self.precision = "bf16" - - if self.precision in (16, "bf16"): - if self.precision == "bf16" and self.amp_type != AMPType.NATIVE: - raise MisconfigurationException( - f"You passed `Trainer(amp_type={self.amp_type.value!r}, precision='bf16')` but it's not supported." - " Try using `amp_type='native'` instead." 
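The precision selection in this hunk (with the native/apex handling continued just below) amounts to a small decision table. The sketch returns plugin names as strings rather than instantiating Lightning classes, and folds the sharded variants into the plain native case:

    def choose_precision(precision, accelerator, amp_type="native", strategy=""):
        """Illustrative: name of the precision plugin the flags above would select."""
        if accelerator == "ipu":
            return "IPUPrecisionPlugin"
        if accelerator == "tpu":
            return "TPUPrecisionPlugin" if precision == 32 else "TPUBf16PrecisionPlugin"
        if strategy == "deepspeed":
            return "DeepSpeedPrecisionPlugin"
        if precision == 32:
            return "PrecisionPlugin"
        if precision == 64:
            return "DoublePrecisionPlugin"
        if precision == 16 and accelerator == "cpu":
            precision = "bf16"                      # native AMP on CPU falls back to bfloat16
        if precision in (16, "bf16"):
            return "NativeMixedPrecisionPlugin" if amp_type == "native" else "ApexMixedPrecisionPlugin"
        raise ValueError(f"unsupported precision {precision!r}")

    assert choose_precision(16, "gpu") == "NativeMixedPrecisionPlugin"
    assert choose_precision(16, "cpu") == "NativeMixedPrecisionPlugin"   # after the bf16 fallback
    assert choose_precision(32, "tpu") == "TPUPrecisionPlugin"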
- ) + self._precision_flag = "bf16" + if self._precision_flag in (16, "bf16"): rank_zero_info( - f"Using 16bit {self.amp_type.value} Automatic Mixed Precision (AMP)" - if self.precision == 16 + f"Using 16bit {self._amp_type_flag.value} Automatic Mixed Precision (AMP)" + if self._precision_flag == 16 else "Using bfloat16 Automatic Mixed Precision (AMP)" ) - if self.amp_type == AMPType.NATIVE: - device = "cpu" if self.use_cpu else "cuda" - - if self._is_sharded_training_type: - return ShardedNativeMixedPrecisionPlugin(self.precision, device) - if self._is_fully_sharded_training_type: - return FullyShardedNativeMixedPrecisionPlugin(self.precision, device) - return NativeMixedPrecisionPlugin(self.precision, device) - - if self.amp_type == AMPType.APEX: - if self._is_sharded_training_type or self._is_fully_sharded_training_type: - raise MisconfigurationException( - "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." - ) - self.amp_level = self.amp_level or "O2" - return ApexMixedPrecisionPlugin(self.amp_level) - - raise RuntimeError("No precision set") - - def select_strategy(self) -> Strategy: - if isinstance(self.distributed_backend, Accelerator) and self.distributed_backend.strategy is not None: - plugin = self.distributed_backend.strategy - elif self.use_ddp2: - plugin = DDP2Strategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment) - elif self.use_ddp and self.use_deepspeed: - plugin = DeepSpeedStrategy( - cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices - ) - elif self.use_ddp and self.use_bagua: - plugin = BaguaStrategy(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment) - elif self.use_ddp: - use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks() - use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.detect() - use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.detect() - use_ddp_spawn = self._strategy_type == _StrategyType.DDP_SPAWN - use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu - use_tpu_spawn = self.use_tpu and self._strategy_type == _StrategyType.TPU_SPAWN - use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.detect() - use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.detect() - use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks() - use_ddp_sharded = self._strategy_type == _StrategyType.DDP_SHARDED - use_ddp_sharded_spawn = self._strategy_type == _StrategyType.DDP_SHARDED_SPAWN - use_ddp_fully_sharded = self._strategy_type == _StrategyType.DDP_FULLY_SHARDED - - if use_tpu_spawn: - ddp_strategy_cls = TPUSpawnStrategy - elif use_ddp_sharded: - ddp_strategy_cls = DDPShardedStrategy - elif use_ddp_sharded_spawn: - ddp_strategy_cls = DDPSpawnShardedStrategy - elif ( - use_ddp_cpu_slurm - or use_slurm_ddp - or use_ddp_cpu_torch_elastic - or use_torchelastic_ddp - or use_kubeflow_ddp - or use_ddp_cpu_kubeflow - ): - ddp_strategy_cls = DDPStrategy - elif use_ddp_spawn or use_ddp_cpu_spawn: - ddp_strategy_cls = DDPSpawnStrategy - elif use_ddp_fully_sharded: - ddp_strategy_cls = DDPFullyShardedStrategy - else: - ddp_strategy_cls = DDPStrategy - - plugin = ddp_strategy_cls( - parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment - ) - elif self.use_dp: - plugin = DataParallelStrategy(parallel_devices=self.parallel_devices) - elif self.use_horovod: - plugin = HorovodStrategy(parallel_devices=self.parallel_devices) - elif 
self.use_tpu and isinstance(self.tpu_cores, list): - plugin = SingleTPUStrategy(self.tpu_id) - elif self.use_ipu: - plugin = IPUStrategy(parallel_devices=self.parallel_devices) - else: - single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) - plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu") - return plugin - - def resolve_strategy(self, training_type: Strategy) -> Strategy: - # necessary for when the user has passed in a plugin - if hasattr(training_type, "parallel_devices") and getattr(training_type, "parallel_devices") is None: - training_type.parallel_devices = self.parallel_devices - - if hasattr(training_type, "cluster_environment") and getattr(training_type, "cluster_environment") is None: - # transfer ownership of the cluster environment to the training type - training_type.cluster_environment = self.cluster_environment - self._cluster_environment = proxy(self.cluster_environment) - - if hasattr(training_type, "num_nodes"): - # set num_nodes for training_type from trainer setting - training_type.num_nodes = self.num_nodes - - if hasattr(training_type, "sync_batchnorm"): - # set sync_batchnorm for training_type from trainer setting - training_type.sync_batchnorm = self.sync_batchnorm - - return training_type - - def select_accelerator(self) -> Accelerator: - if isinstance(self.distributed_backend, Accelerator): - # custom accelerator from user - if self._precision_plugin is not None or self._strategy is not None: - # plugins also specified by user - rank_zero_warn( - "Specified `Precision` and `TrainingType` plugins will be ignored," - " since an `Accelerator` instance was provided." - ) - return self.distributed_backend - - if self.use_gpu: - acc_cls = GPUAccelerator - elif self.use_tpu: - acc_cls = TPUAccelerator - elif self.use_ipu: - acc_cls = IPUAccelerator - else: - acc_cls = CPUAccelerator + if self._amp_type_flag == AMPType.NATIVE: + device = "cpu" if self._accelerator_flag=="cpu" else "cuda" - accelerator = acc_cls() - return accelerator + if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): + return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + if isinstance(self.strategy, DDPFullyShardedStrategy): + return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + return NativeMixedPrecisionPlugin(self._precision_flag, device) - def select_cluster_environment(self) -> ClusterEnvironment: - if self._cluster_environment is not None: - return self._cluster_environment - if self._is_slurm_managing_tasks(): - rank_zero_info("Multiprocessing is handled by SLURM.") - return SLURMEnvironment() + self._amp_level_flag = self._amp_level_flag or "O2" + self.precision_plugin = ApexMixedPrecisionPlugin(self._amp_level_flag) + self.precision_plugin = PrecisionPlugin() - for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): - if env_type.detect(): - return env_type() + def _precision_misconfig_check(self): - return LightningEnvironment() - - def set_distributed_mode(self, strategy: Optional[str] = None): - - if strategy is None and self.is_training_type_in_plugins: - return - - if strategy is not None and strategy in StrategyRegistry: - self.distributed_backend = StrategyRegistry[strategy]["distributed_backend"] - elif strategy is not None: - self.distributed_backend = strategy - - if isinstance(self.distributed_backend, Accelerator): - return - - is_cpu_accelerator_type = self._accelerator_type and self._accelerator_type 
== _AcceleratorType.CPU - _use_cpu = is_cpu_accelerator_type or self.distributed_backend and "cpu" in self.distributed_backend - - if self.distributed_backend is None: - if self.has_horovodrun(): - self._set_horovod_backend() - elif self.num_gpus == 0 and self.num_nodes > 1: - self._strategy_type = _StrategyType.DDP - elif self.num_gpus == 0 and self.num_processes > 1: - self.distributed_backend = _StrategyType.DDP_SPAWN - elif self.num_gpus > 1 and not _use_cpu: - rank_zero_warn( - "You requested multiple GPUs but did not specify a backend, e.g." - ' `Trainer(strategy="dp"|"ddp"|"ddp2")`. Setting `strategy="ddp_spawn"` for you.' + if self._accelerator_flag == "ipu": + if self._precision_flag not in (16, 32): + raise MisconfigurationException( + f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." ) - self.distributed_backend = _StrategyType.DDP_SPAWN - - # special case with DDP on CPUs - if self.distributed_backend == _StrategyType.DDP_CPU: - if _TPU_AVAILABLE: + if self._accelerator_flag == "tpu" and self._precision_flag == 64: raise MisconfigurationException( - "`accelerator='ddp_cpu'` is not supported on TPU machines. " - "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" + "`Trainer(accelerator='tpu', precision=64)` is not implemented." + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " requesting this feature." ) - if self.num_processes == 1 and self.num_nodes > 1: - self._strategy_type = _StrategyType.DDP - else: - self._strategy_type = _StrategyType.DDP_SPAWN - if self.num_gpus > 0: - rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." + if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: + # apex was explicitly passed, not a good idea to silently switch to native AMP + raise MisconfigurationException( + "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" + " but apex AMP not supported on CPU." ) - self.parallel_device_ids = None - if self.num_processes is None: - # define the max CPU available - self.num_processes = os.cpu_count() - # special case with TPUs - elif self.has_tpu and not _use_cpu: - self._device_type = _AcceleratorType.TPU - if isinstance(self.tpu_cores, int): - self._strategy_type = _StrategyType.TPU_SPAWN - elif self.has_ipu and not _use_cpu: - self._device_type = _AcceleratorType.IPU - elif self.distributed_backend and self._strategy_type is None: - self._strategy_type = _StrategyType(self.distributed_backend) - - if self.num_gpus > 0 and not _use_cpu: - self._device_type = _AcceleratorType.GPU - - _gpu_strategy_types = (_StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, _StrategyType.DDP2) - # DP and DDP2 cannot run without GPU - if self.num_gpus == 0 and self._strategy_type in _gpu_strategy_types and not _use_cpu: - - if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1): - if self._strategy_type in (_StrategyType.DP, _StrategyType.DDP2): - rank_zero_warn( - f"{self._strategy_type.value!r} is not supported on CPUs, hence setting `strategy='ddp'`." 
- ) - self._strategy_type = _StrategyType.DDP - else: - rank_zero_warn("You are running on single node with no parallelization, so distributed has no effect.") - self._strategy_type = None - - # finished configuring self._strategy_type, check ipython environment - self.check_interactive_compatibility() - - # for DDP overwrite nb processes by requested GPUs - if self._device_type == _AcceleratorType.GPU and self._strategy_type in ( - _StrategyType.DDP, - _StrategyType.DDP_SPAWN, - ): - self.num_processes = self.num_gpus - - if self._device_type == _AcceleratorType.GPU and self._strategy_type == _StrategyType.DDP2: - self.num_processes = self.num_nodes - - # Horovod is an extra case... - if self.distributed_backend == _StrategyType.HOROVOD: - self._set_horovod_backend() - - using_valid_distributed = self.use_ddp or self.use_ddp2 - if self.num_nodes > 1 and not using_valid_distributed: - # throw error to force user to choose a supported strategy type such as ddp or ddp2 + if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: raise MisconfigurationException( - "Your chosen strategy does not support `num_nodes > 1`. Please set `strategy=('ddp'|'ddp2')`." + f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but it's not supported." + " Try using `amp_type='native'` instead." ) - def _set_horovod_backend(self): - self.check_horovod() - self._strategy_type = _StrategyType.HOROVOD + if self._precision_flag in (16, "bf16") and self._amp_type_flag == AMPType.APEX: + if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy, DDPFullyShardedStrategy)): + raise MisconfigurationException( + "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." + ) - # Initialize Horovod to get rank / size info - hvd.init() - if self.has_gpu: - # Horovod assigns one local GPU per process - self.parallel_device_ids = list(range(hvd.local_size())) - else: - self.num_processes = hvd.local_size() - def check_interactive_compatibility(self): - """Raises a `MisconfigurationException` if the accelerator and/or plugin is not compatible with an - interactive environment.""" - from pytorch_lightning.utilities import _IS_INTERACTIVE + def _lazy_init_strategy(self): + # set strategy properties + self.strategy.accelerator = self.accelerator + if self.precision_plugin: + self.strategy.precision_plugin = self.precision_plugin + if self.checkpoint_io: + self.strategy.checkpoint_io = self.checkpoint_io + self.strategy.cluster_environment = self.cluster_environment + if hasattr(self.strategy, "parallel_devices"): + self.strategy.parallel_devices = self._parallel_devices - if _IS_INTERACTIVE and self._strategy_type is not None and not self._strategy_type.is_interactive_compatible(): + from pytorch_lightning.utilities import _IS_INTERACTIVE + interactive_compatible_strategy = ("dp", "ddp_spawn", "ddp_sharded_spawn", "tpu_spawn") + if _IS_INTERACTIVE and self.strategy.distributed_backend not in interactive_compatible_strategy: raise MisconfigurationException( - f"`Trainer(strategy={self._strategy_type.value!r})` or" - f" `Trainer(accelerator={self._strategy_type.value!r})` is not compatible with an interactive" + f"`Trainer(strategy={self.strategy.distributed_backend!r})` or" + f" `Trainer(accelerator={self.strategy.distributed_backend!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" - f" {', '.join(_StrategyType.interactive_compatible_types())}." 
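The `_precision_misconfig_check` guards spread across the last two hunks condense to five rules. A dependency-free restatement, raising `ValueError` in place of `MisconfigurationException` and taking a boolean for the sharded-strategy case:

    def validate_precision(precision, accelerator, amp_type="native", sharded=False):
        if accelerator == "ipu" and precision not in (16, 32):
            raise ValueError("IPU supports only precision 16 or 32")
        if accelerator == "tpu" and precision == 64:
            raise ValueError("precision=64 is not implemented on TPU")
        if accelerator == "cpu" and precision == 16 and amp_type == "apex":
            raise ValueError("apex AMP is not supported on CPU")
        if precision == "bf16" and amp_type != "native":
            raise ValueError("precision='bf16' requires amp_type='native'")
        if precision in (16, "bf16") and amp_type == "apex" and sharded:
            raise ValueError("sharded strategies are not supported with apex")

    validate_precision(16, "gpu")                       # passes silently
    try:
        validate_precision(64, "tpu")
    except ValueError:
        pass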
+ f" {', '.join(interactive_compatible_strategy)}." " In case you are spawning processes yourself, make sure to include the Trainer" " creation inside the worker function." ) - def check_horovod(self): - """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" - if not _HOROVOD_AVAILABLE: - raise MisconfigurationException( - 'Requested `accelerator="horovod"`, but Horovod is not installed.' - "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" - ) - if self.num_gpus > 1 or self.num_nodes > 1: - raise MisconfigurationException( - "Horovod does not support setting num_nodes / num_gpus explicitly. Use " - "horovodrun / mpirun to configure the number of processes." - ) - @staticmethod - def has_horovodrun() -> bool: - """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" - return _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ) - - def update_device_type_if_ipu_plugin(self) -> None: - # This allows the poptorch.Options that are passed into the IPUStrategy to be the source of truth, - # which gives users the flexibility to not have to pass `ipus` flag directly to Trainer - if isinstance(self._strategy, IPUStrategy) and self._device_type != _AcceleratorType.IPU: - self._device_type = _AcceleratorType.IPU - - def update_device_type_if_strategy_passed(self) -> None: - if isinstance(self._strategy_flag, Strategy) or any(isinstance(plug, Strategy) for plug in self.plugins): - if self._accelerator_type is not None: - if self.use_ipu: - self._device_type = _AcceleratorType.IPU - elif self.use_tpu: - self._device_type = _AcceleratorType.TPU - elif self.use_gpu: - self._device_type = _AcceleratorType.GPU - else: - if self.has_ipu: - self._device_type = _AcceleratorType.IPU - elif self.has_tpu: - self._device_type = _AcceleratorType.TPU - elif self.has_gpu: - self._device_type = _AcceleratorType.GPU - - def _set_strategy_type_if_strategy_passed(self): - # This is required as when `Strategy` instance is passed to either `strategy` - # or `plugins` flag, `AcceleratorConnector.set_distributed_mode` is not required to be - # called and `_strategy_type` is not set. - if self._strategy_type is not None: - return - if self._strategy is not None: - self._strategy_type = getattr(self._strategy, "distributed_backend", None) + ############################################################################## + # the following logic should be deprecated/removed, and these information should be + # retrive from strategies and accelerators + # Added here to keep backward compabilities - def _is_slurm_managing_tasks(self) -> bool: - """Returns whether we let SLURM manage the processes or not. 
+ @property + def parallel_devices(self) -> List[Union[torch.device, int]]: + return self._parallel_devices - Returns ``True`` if and only if these conditions match: + # def _distrib_type(): + @property + def device_type(self): + if isinstance(self.accelerator, CPUAccelerator): + return "cpu" + if isinstance(self.accelerator, GPUAccelerator): + return "gpu" + if isinstance(self.accelerator, TPUAccelerator): + return "tpu" + if isinstance(self.accelerator, IPUAccelerator): + return "ipu" - - A SLURM cluster is detected - - A distributed plugin is being used - - The process is not launching in interactive mode - - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer - """ - if ( - (not self.use_ddp and not self.use_ddp2) - or not SLURMEnvironment.detect() - or SLURMEnvironment.job_name() == "bash" # in interactive mode we don't manage tasks - ): - return False + @property + def num_nodes(self): + return self._num_nodes - total_requested_devices = (self.num_gpus or self.num_processes) * self.num_nodes - num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) - return num_slurm_tasks == total_requested_devices + @property + def num_processes(self): + return self.devices - def _check_plugin_compatibility(self) -> None: - """Checks that selected plugins are compatible with each other. + @property + def root_gpu(self) -> Optional[int]: + return ( + self.strategy.root_device.index + if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) + else None + ) - Raises: - ValueError: If an invalid combination of Accelerator, Strategy, PrecisionPlugin is found. - """ + @property + def devices(self): + return len(self._parallel_devices) + + @property + def tpu_cores(self) -> int: + return self.devices + + @property + def ipus(self) -> int: + return self.devices + + @property + def num_gpus(self) -> int: + return self.devices + + # def parallel_device_ids(): + @property + def gpus(self): + return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None + + + def is_distributed(self): + # Used for custom plugins. + # Custom plugins should implement is_distributed property. + if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): + return self.strategy.is_distributed + distributed_strategy = (DDP2Strategy, DDPStrategy, DDPSpawnShardedStrategy, DDPShardedStrategy, DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, HorovodStrategy) + is_distributed = isinstance(self.strategy, distributed_strategy) if isinstance(self.accelerator, TPUAccelerator): - if not isinstance(self.strategy.precision_plugin, TPUPrecisionPlugin): - raise ValueError( - f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," - f" found: {self.strategy.precision_plugin}." - ) - if not isinstance(self.strategy, (SingleTPUStrategy, TPUSpawnStrategy)): - raise ValueError( - "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," - f" found {self.strategy}." 
- ) + is_distributed |= self.strategy.is_distributed + return is_distributed + + def has_ipu(self): + return isinstance(self.accelerator, IPUAccelerator) + + def has_tpu(self): + return isinstance(self.accelerator, TPUAccelerator) + + def use_dp(self): + return isinstance(self.strategy, DataParallelStrategy) + + @property + def _strategy_type(self) -> _StrategyType: + return self.strategy.distributed_backend diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector_new.py b/pytorch_lightning/trainer/connectors/accelerator_connector_new.py deleted file mode 100644 index 8c69ef6b8ad5a..0000000000000 --- a/pytorch_lightning/trainer/connectors/accelerator_connector_new.py +++ /dev/null @@ -1,680 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -from typing import List, Optional, Sequence, Union -from weakref import proxy - -import torch - -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.cpu import CPUAccelerator -from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.ipu import IPUAccelerator -from pytorch_lightning.accelerators.tpu import TPUAccelerator -from pytorch_lightning.plugins import ( - ApexMixedPrecisionPlugin, - CheckpointIO, - DeepSpeedPrecisionPlugin, - DoublePrecisionPlugin, - FullyShardedNativeMixedPrecisionPlugin, - IPUPrecisionPlugin, - NativeMixedPrecisionPlugin, - PrecisionPlugin, - ShardedNativeMixedPrecisionPlugin, - TPUBf16PrecisionPlugin, - TPUPrecisionPlugin, -) -from pytorch_lightning.plugins.environments import ( - ClusterEnvironment, - KubeflowEnvironment, - LightningEnvironment, - LSFEnvironment, - SLURMEnvironment, - TorchElasticEnvironment, -) -from pytorch_lightning.strategies import ( - DataParallelStrategy, - DDP2Strategy, - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, - DDPSpawnStrategy, - DDPStrategy, - DeepSpeedStrategy, - HorovodStrategy, - IPUStrategy, - SingleDeviceStrategy, - SingleTPUStrategy, - Strategy, - StrategyRegistry, - TPUSpawnStrategy, -) -from pytorch_lightning.utilities import ( - _AcceleratorType, - _StrategyType, - AMPType, - device_parser, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) -from pytorch_lightning.utilities.enums import PrecisionType -from pytorch_lightning.utilities.exceptions import MisconfigurationException, DeviceNotAvailibleException, ImpactableConfigurationException -from pytorch_lightning.utilities.imports import ( - _HOROVOD_AVAILABLE, - _IPU_AVAILABLE, - _GPU_AVAILABLE, - _TORCH_GREATER_EQUAL_1_8, - _TPU_AVAILABLE, -) - -if _HOROVOD_AVAILABLE: - import horovod.torch as hvd - -log = logging.getLogger(__name__) - - -class AcceleratorConnector: - def __init__( - self, - devices, - num_nodes, - accelerator, # reduce typing - strategy: Optional[Union[str, Strategy]], - plugins, - precision, - amp_type, - amp_level, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic: bool, - 
num_processes, # deprecated - tpu_cores, # deprecated - ipus, # deprecated - gpus, # deprecated - gpu_ids, - ): - """ - A. accelerator flag could be: - 1. strategy class (deprecated in 1.5 will be removed in 1.7) - 2. strategy str (deprecated in 1.5 will be removed in 1.7) - 3. accelerator class - 4. accelerator str - 5. accelerator auto - - B. strategy flag could be : - 1. strategy class - 2. strategy str registered with strategyRegister - 3. strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) - - C. plugins flag could be: - 1. List of str, which could contains: - i. strategy str - ii. precision str (Not supported in the old accelerator_connector version) - iii. checkpoint_io str (Not supported in the old accelerator_connector version) - iv. cluster_environment str (Not supported in the old accelerator_connector version) - 2. List of class, which could contains: - i. strategy class (deprecated in 1.5 will be removed in 1.7) - ii. precision class (should be removed, and precision flag should allow user pass classes) - iii. checkpoint_io class - iv. cluster_environment class - - - priorities which to take when: - A. Class > str - B. Strategy > Accelerator/precision/plugins - C. When multiple flag set to the same thing? (ignore? not handled for now) - - """ - - # --Parsing_flags------------------------------------------------------ - # Get registered strategies, existing accelerators and precision plugins - self._existing_strategies_str = StrategyRegistry.available_strategies() - print(self._existing_strategies_str) - self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] - self._supported_precision = PrecisionType.supported_types() - - # raise misconfig exceptions if their is conflict between flags - # set the valid flag to self._x_flag after validation - # for example: if accelerator is strategy class, set self._strategy_flag = accelerator - # for devices: assign gpus ipus and etcs to accelerator_flag and devices_flag - self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) - self._device_config_check_and_set_final_flags(devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores) - - - # --Accelerator------------------------------------------------------------- - # handle `auto` and `None` - if self._accelerator_flag == 'auto' or self._accelerator_flag is None: - self._choose_accelerator() - # else: - # # [RFC] move to XAccelerator class init? - # self._check_device_availibility() - self._set_parallel_devices_and_init_accelerator() - - - # --Cluster_environment----------------------------------------------------- - self._choose_and_init_cluster_environment() - - - # --Strategy Part 1 : choose strategy --------------------------------------- - if self._strategy_flag is None: - self._choose_strategy() - # Reset strategy even user has specificed one - self._strategy_fallbacks() - - - # --Precision---------------------------------------------------------------- - self._check_capatibility_and_init_precision() - - - # --Strategy Part 2 : init Strategy and set Strategy properties ------------- - self._init_strategy() - - - # set properties not used in accelerator_connector. 
TODO move out of this file - # self.gpus = gpus or devices - self.replace_sampler_ddp = replace_sampler_ddp - - def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): - """ - This method checks: - 1. strategy flag: strategy, accelerator and plugin can all set strategies - 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; - If accelerator is strategy related, logic handled in 1 above - 3. precision could be set by precision and plugins flag - 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment - """ - self._strategy_flag, self._accelerator_flag, self._precision_flag, self._cluster_environment, self.checkpoint_io, self._amp_level_flag, self._amp_type_flag = None, None, None, None, None, amp_type, amp_level - if strategy: - self._strategy_flag = strategy - # handle duplications and conflict - if isinstance(accelerator, Strategy) and strategy != accelerator: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in accelerator") - if isinstance(accelerator, str) and accelerator in self._existing_strategies_str and strategy != accelerator: - raise MisconfigurationException("strategy str already set through strategy flag, duplicated in accelerator") - if plugins: - for plugin in plugins: - if isinstance(plugin, Strategy) and strategy != plugin: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") - if isinstance(plugin, str) and plugin in self._existing_strategies_str: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") - - - if accelerator in self._existing_accelerator_type or accelerator=="auto" or isinstance(accelerator, Accelerator): - self._accelerator_flag = accelerator - elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): - rank_zero_deprecation( - f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" - f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." - ) - self._strategy_flag = accelerator - elif accelerator == "ddp_cpu": - rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
- ) - self._strategy_flag = accelerator - - - if precision: - self._precision_flag = precision - # handle duplications and conflict - if plugins: - for plugin in plugins: - if isinstance(plugin, PrecisionPlugin): - raise MisconfigurationException("precision set in both precision flag and plugin flag") - - if plugins: - for plugin in plugins: - if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: - self._strategy_flag = plugin - elif isinstance(plugin, PrecisionPlugin) or isinstance(plugin, str) and plugin in self._supported_precision: - self._precision_flag = plugin - elif isinstance(plugin, CheckpointIO): - self.checkpoint_io = plugin - elif isinstance(plugin, ClusterEnvironment): - self._cluster_environment = plugin - else: - raise MisconfigurationException(f"Does not recognize flag {plugin}") - - - # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up - if self._strategy_flag and isinstance(self._strategy_flag, Strategy): - if self._strategy_flag.accelerator: - if self._accelerator_flag: - raise MisconfigurationException("accelerator set through both strategy class and accelerator flag, choose one") - else: - self._accelerator_flag = self._strategy_flag.accelerator - if self._strategy_flag.precision_plugin: - # precision has default value 32, we can not tell whether user set it or not [RFC] remove default from trainer? - # if self._precision_flag: - # raise MisconfigurationException("precision set through both strategy class and flags, choose one place to set") - # else: - print("here") - self._precision_flag = self._strategy_flag.precision_plugin - if self._strategy_flag.checkpoint_io: - if self.checkpoint_io: - raise MisconfigurationException("checkpoint_io set through both strategy class and plugins, choose one") - else: - self.checkpoint_io = self._strategy_flag.checkpoint_io - if getattr(self._strategy_flag, "cluster_environment", None): - if self._cluster_environment: - raise MisconfigurationException("cluster_environment set through both strategy class and plugins, choose one") - else: - self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") - - - amp_type = amp_type.lower() if isinstance(amp_type, str) else None - self._amp_type_flag = AMPType.from_str(amp_type) if amp_type is not None else None - - # TODO still working on these flags - # if amp_level is not None and self._amp_type_flag != AMPType.APEX: - # raise MisconfigurationException( - # f"You have asked for `amp_level={self._amp_level_flag!r}` but it's only supported with `amp_backend='apex'`." 
- # ) - self._amp_level_flag = amp_level - - - def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): - if num_nodes == "auto": - self._num_nodes_flag = 1 - else : - self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 - - self._device_flag = devices - ##### to be deleted v1.7 - deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores - if deprecated_devices_specific_flag: - self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores) - ##### deleted end - if devices == "auto": - if self._accelerator_flag is None: - raise MisconfigurationException( - f"You passed `devices={devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" - ) - - - def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores): - ##### to be deleted v1.7vbg - # set devices base on num_processes, gpus, ipus, tpu_cores - if devices: - rank_zero_warn(f"will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used") - if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count(True) > 1: - rank_zero_warn(f"more than one device specifc flag has been set") - self._device_flag = deprecated_devices_specific_flag - - if not self._accelerator_flag: - # set accelerator type base on num_processes, gpus, ipus, tpu_cores - if num_processes: - self._accelerator_flag = "cpu" - if gpus: - self._accelerator_flag = "gpu" - if tpu_cores: - self._accelerator_flag = "tpu" - if ipus: - self._accelerator_flag = "ipu" - #### delete end - - def _choose_accelerator(self): - if self._accelerator_flag == "auto": - if _TPU_AVAILABLE: - self._accelerator_flag = "tpu" - elif _IPU_AVAILABLE: - self._accelerator_flag = "ipu" - elif _GPU_AVAILABLE: - self._accelerator_flag = "gpu" - else: - self._accelerator_flag = "cpu" - if self._device_flag == "auto": - self._device_flag = 1 - # [RFC] this is current logic, if accelerator not set, default cpu? 
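Editor's note: the `auto` handling in `_choose_accelerator` above is a straight availability cascade (TPU, then IPU, then GPU, then CPU, with `devices='auto'` falling back to a single device). A minimal, self-contained sketch of that order, with the `_TPU_AVAILABLE`/`_IPU_AVAILABLE`/`_GPU_AVAILABLE` flags passed in as plain booleans, might look like:

    # Illustrative sketch only, not part of the patch: the "auto" accelerator cascade.
    def choose_accelerator(accelerator_flag, device_flag,
                           tpu_available, ipu_available, gpu_available):
        if accelerator_flag == "auto":
            if tpu_available:
                accelerator_flag = "tpu"
            elif ipu_available:
                accelerator_flag = "ipu"
            elif gpu_available:
                accelerator_flag = "gpu"
            else:
                accelerator_flag = "cpu"
            if device_flag == "auto":
                device_flag = 1
        else:
            # current behaviour: an unset accelerator defaults to CPU
            accelerator_flag = accelerator_flag or "cpu"
        return accelerator_flag, device_flag
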
- else: - self._accelerator_flag = "cpu" - - - def _check_device_availibility(self): - for accelerator_flag, available in zip(self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True]): - if self._accelerator_flag == accelerator_flag: - if not available: - raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") - - # TODO in progress for setting up devices - def _set_parallel_devices_and_init_accelerator(self): - self._parallel_devices = [] - - if isinstance(self._accelerator_flag, Accelerator): - self.accelerator = self._accelerator_flag - elif self._accelerator_flag == "tpu": - self.accelerator = TPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = TPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = list(range(self._device_flag)) - - elif self._accelerator_flag == "ipu": - self.accelerator = IPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = IPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = list(range(self._device_flag)) - - elif self._accelerator_flag == "gpu": - self.accelerator = GPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = GPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): - self._device_flag = int(self._device_flag) - self._parallel_devices = [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] - elif isinstance(self._device_flag, list): - self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] - - - elif self._accelerator_flag == "cpu": - self.accelerator = CPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = CPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = [torch.device("cpu")] * self._device_flag - - self._gpus = self._device_flag - - - def _choose_and_init_cluster_environment(self): - self.cluster_environment = LightningEnvironment() - if isinstance(self._cluster_environment, ClusterEnvironment): - self.cluster_environment = self._cluster_environment - elif self._is_slurm_managing_tasks(): - rank_zero_info("Multiprocessing is handled by SLURM.") - self.cluster_environment = SLURMEnvironment() - else: - for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): - if env_type.detect(): - self.cluster_environment = env_type() - - - def _is_slurm_managing_tasks(self): - """ - used by choosing cluster enviroment - """ - if ( - #(not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") - # the above logic moved to _select_strategy(), only check _is_slurm_managing_tasks() - # when strategy flag is ddp or ddp2 - not SLURMEnvironment.detect() - or SLURMEnvironment.job_name() == "bash" # in interactive mode we don't manage tasks - ): - return False - - total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag - num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) - return num_slurm_tasks == total_requested_devices - - def _choose_strategy(self): - if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): - self._strategy_flag = HorovodStrategy() - - if self._accelerator_flag == "ipu": - self._strategy_flag = "ipu" - elif self._accelerator_flag == "tpu": - if self._parallel_devices and 
len(self._parallel_devices)>1: - self._strategy_flag = "tpu_spawn" - else: - self._srategy_flag = SingleTPUStrategy() - - # [RFC] in existing logic SingleDevice strategy choice diverge between cpu and gpu, should we merge? - # elif self._accelerator_flag == "gpu": - # if self._num_nodes_flag > 1: - # self._strategy_flag = "ddp" - # elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): - # self._strategy_flag = "ddp" - # elif len(self._parallel_devices) == 1: - # self._strategy_flag = "ddp" - # elif len(self._parallel_devices) > 1: - # self._strategy_flag = "ddp_spawn" - # else: - # self._strategy_flag = "ddp" - else: - if self._num_nodes_flag > 1: - self._strategy_flag = "ddp" - elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): - self._strategy_flag = "ddp" - elif len(self._parallel_devices) <= 1: - device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" - self._strategy_flag = SingleDeviceStrategy(device = device) - elif len(self._parallel_devices) > 1: - self._strategy_flag = "ddp_spawn" - else: - self._strategy_flag = "ddp" - - def _strategy_fallbacks(self): - _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag - if _strategy_flag == "ddp_cpu": - if _TPU_AVAILABLE: - raise MisconfigurationException( - "`accelerator='ddp_cpu'` is not supported on TPU machines. " - "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" - ) - if self._device_flag ==1 and self._num_nodes_flag > 1: - _strategy_flag = "ddp" - else: - _strategy_flag = "ddp_spawn" - if self._accelerator_flag == "gpu": - rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." - ) - # if self._accelerator_flag == "cpu": - # self._parallel_devices = os.cpu_count() - - if "ddp_spawn" in _strategy_flag and (TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()): - _strategy_flag = "ddp" - - if _strategy_flag: - self._strategy_flag = _strategy_flag - - - def _check_capatibility_and_init_precision(self): - print(self._precision_flag) - self._precision_misconfig_check() - if isinstance(self._precision_flag, PrecisionPlugin): - self.precision_plugin = self._precision_flag - return - - if self._accelerator_flag =="ipu": - self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) - if self._accelerator_flag == "tpu": - if self._precision_flag == 32: - self.precision_plugin = TPUPrecisionPlugin() - elif self._precision_flag in (16, "bf16"): - if self._precision_flag == 16: - # this is not deprecated to ease transition between accelerator environments - rank_zero_warn( - f"You passed `Trainer(accelerator='tpu', precision=16)` but {self._amp_type_flag.value} AMP" - f" is not supported with TPUs. Using `precision='bf16'` instead." 
- ) - self.precision_plugin = TPUBf16PrecisionPlugin() - if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): - self.precision_plugin = DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) - - if self._precision_flag == 32: - self.precision_plugin = PrecisionPlugin() - if self._precision_flag == 64: - self.precision_plugin = DoublePrecisionPlugin() - - # maybe convert the precision value - if self._precision_flag == 16 and self._accelerator_flag == "cpu": - # this automatic switch is to ease transition between accelerator environments - rank_zero_warn( - "You passed `Trainer(accelerator='cpu', precision=16)` but native AMP is not supported on CPU." - " Using `precision='bf16'` instead." - ) - self._precision_flag = "bf16" - - if self._precision_flag in (16, "bf16"): - rank_zero_info( - f"Using 16bit {self._amp_type_flag.value} Automatic Mixed Precision (AMP)" - if self._precision_flag == 16 - else "Using bfloat16 Automatic Mixed Precision (AMP)" - ) - - if self._amp_type_flag == AMPType.NATIVE: - device = "cpu" if self._accelerator_flag=="cpu" else "cuda" - - # TODO in progress implement the two following shard types - # if self._is_sharded_training_type: - # return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - # if self._is_fully_sharded_training_type: - # return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - # return NativeMixedPrecisionPlugin(self._precision_flag, device) - - - self._amp_level_flag = self._amp_level_flag or "O2" - self.precision_plugin = ApexMixedPrecisionPlugin(self._amp_level_flag) - self.precision_plugin = PrecisionPlugin() - - def _precision_misconfig_check(self): - - if self._accelerator_flag == "ipu": - if self._precision_flag not in (16, 32): - raise MisconfigurationException( - f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." - ) - if self._accelerator_flag == "tpu" and self._precision_flag == 64: - raise MisconfigurationException( - "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" - " requesting this feature." - ) - if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: - # apex was explicitly passed, not a good idea to silently switch to native AMP - raise MisconfigurationException( - "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" - " but apex AMP not supported on CPU." - ) - if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: - raise MisconfigurationException( - f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but it's not supported." - " Try using `amp_type='native'` instead." - ) - - # if self._precision_flag in (16, "bf16") and self._amp_type_flag == AMPType.APEX: - # if self._is_sharded_training_type or self._is_fully_sharded_training_type: - # raise MisconfigurationException( - # "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." 
- # ) - - - def _init_strategy(self): - print(self._strategy_flag) - if isinstance(self._strategy_flag, str): - self.strategy = StrategyRegistry.get(self._strategy_flag) - else: - self.strategy = self._strategy_flag - self.strategy.accelerator = self.accelerator - if self.precision_plugin: - self.strategy.precision_plugin = self.precision_plugin - if self.checkpoint_io: - self.strategy.checkpoint_io = self.checkpoint_io - self.strategy.cluster_environment = self.cluster_environment - if hasattr(self.strategy, "parallel_devices"): - self.strategy.parallel_devices = self._parallel_devices - - - - - - ############################################################################## - # the following logic should be deprecated/removed, and these information should be - # retrive from strategies and accelerators - # Added here to keep backward compabilities - - @property - def parallel_devices(self) -> List[Union[torch.device, int]]: - return self._parallel_devices - - # def _distrib_type(): - @property - def device_type(self): - if isinstance(self.accelerator, CPUAccelerator): - return "cpu" - if isinstance(self.accelerator, GPUAccelerator): - return "gpu" - if isinstance(self.accelerator, TPUAccelerator): - return "tpu" - if isinstance(self.accelerator, IPUAccelerator): - return "ipu" - - @property - def num_nodes(self): - return self._num_nodes - - @property - def num_processes(self): - return self.devices - - @property - def root_gpu(self) -> Optional[int]: - return ( - self.strategy.root_device.index - if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) - else None - ) - - @property - def devices(self): - return len(self._parallel_devices) - - @property - def tpu_cores(self) -> int: - return self.devices - - @property - def ipus(self) -> int: - return self.devices - - @property - def num_gpus(self) -> int: - return self.devices - - # def parallel_device_ids(): - @property - def gpus(self): - return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None - - - def is_distributed(self): - # Used for custom plugins. - # Custom plugins should implement is_distributed property. 
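Editor's note: stripped of the surrounding bookkeeping, the strategy initialisation above reduces to resolving a string through the registry and then attaching the already-chosen components. A condensed sketch, with `registry` standing in for `StrategyRegistry`:

    # Condensed, illustrative version of `_init_strategy` / `_lazy_init_strategy`.
    def init_strategy(strategy_flag, registry, accelerator, precision_plugin,
                      checkpoint_io, cluster_environment, parallel_devices):
        # a registered name resolves to an instance; an instance is used as-is
        strategy = registry.get(strategy_flag) if isinstance(strategy_flag, str) else strategy_flag
        strategy.accelerator = accelerator
        if precision_plugin:
            strategy.precision_plugin = precision_plugin
        if checkpoint_io:
            strategy.checkpoint_io = checkpoint_io
        strategy.cluster_environment = cluster_environment
        if hasattr(strategy, "parallel_devices"):
            strategy.parallel_devices = parallel_devices
        return strategy
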
- if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): - return self.strategy.is_distributed - distributed_strategy = (DDP2Strategy, DDPStrategy, DDPSpawnShardedStrategy, DDPShardedStrategy, DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, HorovodStrategy) - is_distributed = isinstance(self.strategy, distributed_strategy) - if isinstance(self.accelerator, TPUAccelerator): - is_distributed |= self.strategy.is_distributed - return is_distributed - - def has_ipu(self): - return isinstance(self.accelerator, IPUAccelerator) - - def has_tpu(self): - return isinstance(self.accelerator, TPUAccelerator) - - def use_dp(self): - return isinstance(self.strategy, DataParallelStrategy) - - @property - def _strategy_type(self) -> _StrategyType: - return self.strategy.distributed_backend diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 8ceb2de96c59c..8617b5a2c8095 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -468,10 +468,10 @@ def test_accelerator_gpu(): assert trainer._device_type == "gpu" assert isinstance(trainer.accelerator, GPUAccelerator) - with pytest.raises( - MisconfigurationException, match="You passed `accelerator='gpu'`, but you didn't pass `gpus` to `Trainer`" - ): - trainer = Trainer(accelerator="gpu") + # with pytest.raises( + # MisconfigurationException, match="You passed `accelerator='gpu'`, but you didn't pass `gpus` to `Trainer`" + # ): + trainer = Trainer(accelerator="gpu") trainer = Trainer(accelerator="auto", gpus=1) @@ -552,8 +552,10 @@ def test_accelerator_gpu_with_gpus_priority(): def test_validate_accelerator_and_devices(): - with pytest.raises(MisconfigurationException, match="You passed `devices=2` but haven't specified"): - Trainer(accelerator="ddp_cpu", devices=2) + # with pytest.raises(MisconfigurationException, match="You passed `devices=2` but haven't specified"): + trainer = Trainer(accelerator="ddp_cpu", devices=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert trainer.num_processes == 2 def test_set_devices_if_none_cpu(): From 3999d80b1f320d83121f6b3b580b6e94e6dd5bc4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jan 2022 06:22:30 +0000 Subject: [PATCH 04/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/ddp2.py | 3 +- pytorch_lightning/strategies/dp.py | 3 +- pytorch_lightning/strategies/horovod.py | 2 +- pytorch_lightning/strategies/single_device.py | 3 +- pytorch_lightning/strategies/single_tpu.py | 3 +- .../connectors/accelerator_connector.py | 244 +++++++++--------- pytorch_lightning/trainer/trainer.py | 34 +-- pytorch_lightning/utilities/exceptions.py | 2 + pytorch_lightning/utilities/imports.py | 2 +- 9 files changed, 157 insertions(+), 139 deletions(-) diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index 2633508e6bd82..ba8e769c35772 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
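Editor's note: the relaxed accelerator-connector tests above sketch the user-facing behaviour this rewrite is aiming for. Roughly, and assuming a machine that actually has the hardware each accelerator needs:

    # Illustrative usage implied by the updated tests (subject to later commits in this series).
    from pytorch_lightning import Trainer

    trainer = Trainer(accelerator="gpu")                 # no longer requires `gpus=...` to be passed
    trainer = Trainer(accelerator="ddp_cpu", devices=2)  # CPUAccelerator with num_processes == 2
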
-import torch from typing import Dict +import torch + from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.enums import _StrategyType diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index bcac4f4f156d5..01066a21c0e71 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, Optional, Dict +from typing import Any, Dict, List, Optional import torch from torch.nn import DataParallel, Module @@ -157,7 +157,6 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description=f"{cls.__class__.__name__} Strategy", ) - def teardown(self) -> None: super().teardown() if self.root_device.type == "cuda": diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 90b091a9eee18..1e99dbc429ed8 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import ExitStack -from typing import Any, List, Optional, Tuple, Union, Dict +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index f866dfe204ade..adbd3d71371b5 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -26,6 +26,7 @@ class SingleDeviceStrategy(Strategy): """Strategy that handles communication on a single device.""" + distributed_backend = "single_device" def __init__( @@ -81,7 +82,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: return obj @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: + def register_strategies(cls, strategy_registry: dict) -> None: strategy_registry.register( cls.distributed_backend, cls, diff --git a/pytorch_lightning/strategies/single_tpu.py b/pytorch_lightning/strategies/single_tpu.py index 3d471f2dabd24..942f9ebfa9a41 100644 --- a/pytorch_lightning/strategies/single_tpu.py +++ b/pytorch_lightning/strategies/single_tpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
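Editor's note: the recurring pattern in these strategy modules is self-registration keyed on `distributed_backend`. A hypothetical custom strategy could plug into the same mechanism; the class, key, and description below are invented for illustration, and the `register()` signature is assumed to match the calls used elsewhere in this series:

    # Hypothetical example of the registration pattern used throughout this series.
    from typing import Dict

    from pytorch_lightning.strategies import SingleDeviceStrategy


    class MyStrategy(SingleDeviceStrategy):
        distributed_backend = "my_strategy"  # the key the connector resolves strategy strings against

        @classmethod
        def register_strategies(cls, strategy_registry: Dict) -> None:
            strategy_registry.register(
                cls.distributed_backend,
                cls,
                description="Example custom strategy",
            )
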
import os -from typing import Optional, Dict +from typing import Dict, Optional import pytorch_lightning as pl from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO @@ -27,6 +27,7 @@ class SingleTPUStrategy(SingleDeviceStrategy): """Strategy for training on a single TPU device.""" + distributed_backend = "single_tpu" def __init__( diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 72c9a78f06602..ec30e3469d451 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -66,11 +66,15 @@ ) from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, AMPType, device_parser from pytorch_lightning.utilities.enums import PrecisionType -from pytorch_lightning.utilities.exceptions import MisconfigurationException, DeviceNotAvailibleException, ImpactableConfigurationException +from pytorch_lightning.utilities.exceptions import ( + DeviceNotAvailibleException, + ImpactableConfigurationException, + MisconfigurationException, +) from pytorch_lightning.utilities.imports import ( + _GPU_AVAILABLE, _HOROVOD_AVAILABLE, _IPU_AVAILABLE, - _GPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, ) @@ -87,7 +91,7 @@ def __init__( self, devices, num_nodes, - accelerator, # reduce typing + accelerator, # reduce typing strategy: Optional[Union[str, Strategy]], plugins, precision, @@ -97,10 +101,10 @@ def __init__( benchmark, replace_sampler_ddp, deterministic: bool, - num_processes, # deprecated - tpu_cores, # deprecated - ipus, # deprecated - gpus, # deprecated + num_processes, # deprecated + tpu_cores, # deprecated + ipus, # deprecated + gpus, # deprecated gpu_ids, ): """ @@ -148,23 +152,22 @@ def __init__( # for example: if accelerator is strategy class, set self._strategy_flag = accelerator # for devices: assign gpus ipus and etcs to accelerator_flag and devices_flag self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) - self._device_config_check_and_set_final_flags(devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores) - + self._device_config_check_and_set_final_flags( + devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores + ) # --Accelerator------------------------------------------------------------- # handle `auto` and `None` - if self._accelerator_flag == 'auto' or self._accelerator_flag is None: + if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._choose_accelerator() # else: # # [RFC] move to XAccelerator class init? # self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() - # --Cluster_environment----------------------------------------------------- self._choose_and_init_cluster_environment() - # --Strategy Part 1 : choose strategy --------------------------------------- if self._strategy_flag is None: self._choose_strategy() @@ -175,26 +178,31 @@ def __init__( # --Precision---------------------------------------------------------------- self._check_capatibility_and_init_precision() - # --Strategy Part 2 : init Strategy and set Strategy properties ------------- self._lazy_init_strategy() - - # set properties not used in accelerator_connector. 
TODO move out of this file # self.gpus = gpus or devices self.replace_sampler_ddp = replace_sampler_ddp def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): + """This method checks: + + 1. strategy flag: strategy, accelerator and plugin can all set strategies + 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; + If accelerator is strategy related, logic handled in 1 above + 3. precision could be set by precision and plugins flag + 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment """ - This method checks: - 1. strategy flag: strategy, accelerator and plugin can all set strategies - 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; - If accelerator is strategy related, logic handled in 1 above - 3. precision could be set by precision and plugins flag - 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment - """ - self._strategy_flag, self._accelerator_flag, self._precision_flag, self._cluster_environment, self.checkpoint_io, self._amp_level_flag, self._amp_type_flag = None, None, None, None, None, amp_type, amp_level + ( + self._strategy_flag, + self._accelerator_flag, + self._precision_flag, + self._cluster_environment, + self.checkpoint_io, + self._amp_level_flag, + self._amp_type_flag, + ) = (None, None, None, None, None, amp_type, amp_level) if strategy: self._strategy_flag = strategy if strategy == "ddp_cpu": @@ -204,23 +212,36 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) if strategy == "tpu_spawn": raise MisconfigurationException( - "`Trainer(strategy='tpu_spawn')` is not a valid strategy," - " you can use `Trainer(strategy='ddp_spawn', accelerator='tpu')` instead." - ) + "`Trainer(strategy='tpu_spawn')` is not a valid strategy," + " you can use `Trainer(strategy='ddp_spawn', accelerator='tpu')` instead." 
+ ) # handle duplications and conflict if isinstance(accelerator, Strategy) and strategy != accelerator: raise MisconfigurationException("strategy already set through strategy flag, duplicated in accelerator") - if isinstance(accelerator, str) and accelerator in self._existing_strategies_str and strategy != accelerator: - raise MisconfigurationException("strategy str already set through strategy flag, duplicated in accelerator") + if ( + isinstance(accelerator, str) + and accelerator in self._existing_strategies_str + and strategy != accelerator + ): + raise MisconfigurationException( + "strategy str already set through strategy flag, duplicated in accelerator" + ) if plugins: for plugin in plugins: if isinstance(plugin, Strategy) and strategy != plugin: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") + raise MisconfigurationException( + "strategy already set through strategy flag, duplicated in plugins" + ) if isinstance(plugin, str) and plugin in self._existing_strategies_str: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in plugins") - + raise MisconfigurationException( + "strategy already set through strategy flag, duplicated in plugins" + ) - if accelerator in self._existing_accelerator_type or accelerator=="auto" or isinstance(accelerator, Accelerator): + if ( + accelerator in self._existing_accelerator_type + or accelerator == "auto" + or isinstance(accelerator, Accelerator) + ): self._accelerator_flag = accelerator elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): rank_zero_deprecation( @@ -230,8 +251,8 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._strategy_flag = accelerator elif accelerator == "ddp_cpu": rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." - ) + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." + ) self._strategy_flag = accelerator if precision: @@ -251,24 +272,28 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl rank_zero_deprecation( f"Passing {plugin} `strategy` to the `plugins` flag in Trainer has been deprecated" f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={plugin})` instead." 
- ) + ) - elif isinstance(plugin, PrecisionPlugin) or isinstance(plugin, str) and plugin in self._supported_precision: + elif ( + isinstance(plugin, PrecisionPlugin) + or isinstance(plugin, str) + and plugin in self._supported_precision + ): self._precision_flag = plugin elif isinstance(plugin, CheckpointIO): - self.checkpoint_io = plugin + self.checkpoint_io = plugin elif isinstance(plugin, ClusterEnvironment): self._cluster_environment = plugin else: raise MisconfigurationException(f"Does not recognize flag {plugin}") - - # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up if self._strategy_flag and isinstance(self._strategy_flag, Strategy): if self._strategy_flag.accelerator: if self._accelerator_flag: - raise MisconfigurationException("accelerator set through both strategy class and accelerator flag, choose one") + raise MisconfigurationException( + "accelerator set through both strategy class and accelerator flag, choose one" + ) else: self._accelerator_flag = self._strategy_flag.accelerator if self._strategy_flag.precision_plugin: @@ -279,16 +304,19 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._precision_flag = self._strategy_flag.precision_plugin if self._strategy_flag.checkpoint_io: if self.checkpoint_io: - raise MisconfigurationException("checkpoint_io set through both strategy class and plugins, choose one") + raise MisconfigurationException( + "checkpoint_io set through both strategy class and plugins, choose one" + ) else: self.checkpoint_io = self._strategy_flag.checkpoint_io if getattr(self._strategy_flag, "cluster_environment", None): if self._cluster_environment: - raise MisconfigurationException("cluster_environment set through both strategy class and plugins, choose one") + raise MisconfigurationException( + "cluster_environment set through both strategy class and plugins, choose one" + ) else: self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") - amp_type = amp_type.lower() if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) if amp_type is not None else None @@ -299,18 +327,19 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl # ) self._amp_level_flag = amp_level - def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): if num_nodes == "auto": self._num_nodes_flag = 1 - else : + else: self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 self._device_flag = devices ##### to be deleted v1.7 deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag: - self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores) + self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores + ) ##### deleted end if devices == "auto": if self._accelerator_flag is None: @@ -319,18 +348,23 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" ) - - def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag(self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores): + def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + 
self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores + ): ##### to be deleted v1.7vbg # set devices base on num_processes, gpus, ipus, tpu_cores if devices: - rank_zero_warn(f"The flag `devices={devices}` will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used") - if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count(True) > 1: + rank_zero_warn( + f"The flag `devices={devices}` will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used" + ) + if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( + True + ) > 1: rank_zero_warn(f"more than one device specifc flag has been set") self._device_flag = deprecated_devices_specific_flag if not self._accelerator_flag: - # set accelerator type base on num_processes, gpus, ipus, tpu_cores + # set accelerator type base on num_processes, gpus, ipus, tpu_cores if num_processes: self._accelerator_flag = "cpu" if gpus: @@ -357,9 +391,10 @@ def _choose_accelerator(self): else: self._accelerator_flag = "cpu" - def _check_device_availibility(self): - for accelerator_flag, available in zip(self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True]): + for accelerator_flag, available in zip( + self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True] + ): if self._accelerator_flag == accelerator_flag: if not available: raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") @@ -387,51 +422,28 @@ def _set_parallel_devices_and_init_accelerator(self): elif self._accelerator_flag == "gpu": self.accelerator = GPUAccelerator() if self._device_flag == "auto" or not self._device_flag: - self._device_flag = GPUAccelerator.auto_device_count() + self._device_flag = GPUAccelerator.auto_device_count() if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): self._device_flag = int(self._device_flag) - self._parallel_devices = [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + self._parallel_devices = [ + torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag) + ] elif isinstance(self._device_flag, list): self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] - elif self._accelerator_flag == "cpu": self.accelerator = CPUAccelerator() if self._device_flag == "auto" or not self._device_flag: - self._device_flag = CPUAccelerator.auto_device_count() + self._device_flag = CPUAccelerator.auto_device_count() if not isinstance(self._device_flag, int): raise MisconfigurationException( "The flag `devices` must be an int with `accelerator='cpu'`," f" got `devices={self._device_flag}` instead." 
) -<<<<<<< HEAD - self.num_processes = self.devices - return True - return False - - @property - def use_dp(self) -> bool: - return self._strategy_type == _StrategyType.DP - - @property - def use_ddp(self) -> bool: - return self._strategy_type in ( - _StrategyType.BAGUA, - _StrategyType.DDP, - _StrategyType.DDP_SPAWN, - _StrategyType.DDP_SHARDED, - _StrategyType.DDP_SHARDED_SPAWN, - _StrategyType.DDP_FULLY_SHARDED, - _StrategyType.DEEPSPEED, - _StrategyType.TPU_SPAWN, - ) -======= self._parallel_devices = [torch.device("cpu")] * self._device_flag ->>>>>>> dccae1d6f (update) self._gpus = self._device_flag - def _choose_and_init_cluster_environment(self): self.cluster_environment = LightningEnvironment() if isinstance(self._cluster_environment, ClusterEnvironment): @@ -444,23 +456,15 @@ def _choose_and_init_cluster_environment(self): if env_type.detect(): self.cluster_environment = env_type() -<<<<<<< HEAD - @property - def use_bagua(self) -> bool: - return self._strategy_type == _StrategyType.BAGUA @property def _is_sharded_training_type(self) -> bool: return isinstance(self._strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)) -======= ->>>>>>> dccae1d6f (update) def _is_slurm_managing_tasks(self): - """ - used by choosing cluster enviroment - """ + """used by choosing cluster enviroment.""" if ( - #(not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") + # (not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") # the above logic moved to _select_strategy(), only check _is_slurm_managing_tasks() # when strategy flag is ddp or ddp2 not SLURMEnvironment.detect() @@ -479,7 +483,7 @@ def _choose_strategy(self): if self._accelerator_flag == "ipu": self._strategy_flag = "ipu" elif self._accelerator_flag == "tpu": - if self._parallel_devices and len(self._parallel_devices)>1: + if self._parallel_devices and len(self._parallel_devices) > 1: self._strategy_flag = "tpu_spawn" else: self._srategy_flag = SingleTPUStrategy() @@ -490,13 +494,12 @@ def _choose_strategy(self): self._strategy_flag = "ddp" elif len(self._parallel_devices) <= 1: device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" - self._strategy_flag = SingleDeviceStrategy(device = device) + self._strategy_flag = SingleDeviceStrategy(device=device) elif len(self._parallel_devices) > 1: self._strategy_flag = "ddp_spawn" else: self._strategy_flag = "ddp" - def _strategy_fallbacks(self): _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if _strategy_flag == "ddp_cpu": @@ -505,7 +508,7 @@ def _strategy_fallbacks(self): "`accelerator='ddp_cpu'` is not supported on TPU machines. " "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) - if self._device_flag ==1 and self._num_nodes_flag > 1: + if self._device_flag == 1 and self._num_nodes_flag > 1: _strategy_flag = "ddp" else: _strategy_flag = "ddp_spawn" @@ -513,12 +516,12 @@ def _strategy_fallbacks(self): rank_zero_warn( "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
) - if "ddp_spawn" in _strategy_flag and (TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks()): + if "ddp_spawn" in _strategy_flag and ( + TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() + ): _strategy_flag = "ddp" if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": - rank_zero_warn( - f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`." - ) + rank_zero_warn(f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") _strategy_flag = "ddp" if _strategy_flag: self._strategy_flag = _strategy_flag @@ -537,7 +540,7 @@ def _check_capatibility_and_init_precision(self): self.precision_plugin = self._precision_flag return - if self._accelerator_flag =="ipu": + if self._accelerator_flag == "ipu": self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) if self._accelerator_flag == "tpu": if self._precision_flag == 32: @@ -551,7 +554,9 @@ def _check_capatibility_and_init_precision(self): ) self.precision_plugin = TPUBf16PrecisionPlugin() if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): - self.precision_plugin = DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) + self.precision_plugin = DeepSpeedPrecisionPlugin( + self._precision_flag, self._amp_type_flag, self._amp_level_flag + ) if self._precision_flag == 32: self.precision_plugin = PrecisionPlugin() @@ -575,7 +580,7 @@ def _check_capatibility_and_init_precision(self): ) if self._amp_type_flag == AMPType.NATIVE: - device = "cpu" if self._accelerator_flag=="cpu" else "cuda" + device = "cpu" if self._accelerator_flag == "cpu" else "cuda" if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) @@ -595,17 +600,17 @@ def _precision_misconfig_check(self): f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." ) if self._accelerator_flag == "tpu" and self._precision_flag == 64: - raise MisconfigurationException( - "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" - " requesting this feature." - ) + raise MisconfigurationException( + "`Trainer(accelerator='tpu', precision=64)` is not implemented." + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " requesting this feature." + ) if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: - # apex was explicitly passed, not a good idea to silently switch to native AMP - raise MisconfigurationException( - "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" - " but apex AMP not supported on CPU." - ) + # apex was explicitly passed, not a good idea to silently switch to native AMP + raise MisconfigurationException( + "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" + " but apex AMP not supported on CPU." + ) if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: raise MisconfigurationException( f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but it's not supported." @@ -618,7 +623,6 @@ def _precision_misconfig_check(self): "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." 
) - def _lazy_init_strategy(self): # set strategy properties self.strategy.accelerator = self.accelerator @@ -631,6 +635,7 @@ def _lazy_init_strategy(self): self.strategy.parallel_devices = self._parallel_devices from pytorch_lightning.utilities import _IS_INTERACTIVE + interactive_compatible_strategy = ("dp", "ddp_spawn", "ddp_sharded_spawn", "tpu_spawn") if _IS_INTERACTIVE and self.strategy.distributed_backend not in interactive_compatible_strategy: raise MisconfigurationException( @@ -702,13 +707,22 @@ def num_gpus(self) -> int: def gpus(self): return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None - def is_distributed(self): # Used for custom plugins. # Custom plugins should implement is_distributed property. if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): return self.strategy.is_distributed - distributed_strategy = (DDP2Strategy, DDPStrategy, DDPSpawnShardedStrategy, DDPShardedStrategy, DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, HorovodStrategy) + distributed_strategy = ( + DDP2Strategy, + DDPStrategy, + DDPSpawnShardedStrategy, + DDPShardedStrategy, + DDPFullyShardedStrategy, + DDPSpawnStrategy, + DeepSpeedStrategy, + TPUSpawnStrategy, + HorovodStrategy, + ) is_distributed = isinstance(self.strategy, distributed_strategy) if isinstance(self.accelerator, TPUAccelerator): is_distributed |= self.strategy.is_distributed diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 93fd6187be1ea..bd648dd99d332 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -435,23 +435,23 @@ def __init__( self._data_connector = DataConnector(self, multiple_trainloader_mode) self._accelerator_connector = AcceleratorConnector( - num_processes = num_processes, - devices = devices, - tpu_cores = tpu_cores, - ipus = ipus, - accelerator = accelerator, - strategy = strategy, - gpus = gpus, - gpu_ids = gpu_ids, - num_nodes = num_nodes, - sync_batchnorm = sync_batchnorm, - benchmark = benchmark, - replace_sampler_ddp = replace_sampler_ddp, - deterministic = deterministic, - precision = precision, - amp_type = amp_backend, - amp_level = amp_level, - plugins = plugins, + num_processes=num_processes, + devices=devices, + tpu_cores=tpu_cores, + ipus=ipus, + accelerator=accelerator, + strategy=strategy, + gpus=gpus, + gpu_ids=gpu_ids, + num_nodes=num_nodes, + sync_batchnorm=sync_batchnorm, + benchmark=benchmark, + replace_sampler_ddp=replace_sampler_ddp, + deterministic=deterministic, + precision=precision, + amp_type=amp_backend, + amp_level=amp_level, + plugins=plugins, ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) diff --git a/pytorch_lightning/utilities/exceptions.py b/pytorch_lightning/utilities/exceptions.py index 24fbbac44d156..a0de06036792f 100644 --- a/pytorch_lightning/utilities/exceptions.py +++ b/pytorch_lightning/utilities/exceptions.py @@ -16,9 +16,11 @@ class MisconfigurationException(Exception): """Exception used to inform users of misuse with PyTorch Lightning.""" + class DeviceNotAvailibleException(Exception): """Exception used to inform users that requested devices are not availible.""" + class ImpactableConfigurationException(Exception): """Exception used to inform users that configuration impactable with each other.""" diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 602c8b50c92e9..24355097ce34f 
100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -133,7 +133,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False -_GPU_AVAILABLE = torch.cuda.is_available() and torch.cuda.device_count()>0 +_GPU_AVAILABLE = torch.cuda.is_available() and torch.cuda.device_count() > 0 # experimental feature within PyTorch Lightning. From c2730f941ac3c78ee2e3b25b09a32232b6cd011b Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Tue, 25 Jan 2022 16:40:39 -0800 Subject: [PATCH 05/69] update --- .../connectors/accelerator_connector.py | 126 ++++++++++-------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index ec30e3469d451..344d32dcfb310 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -14,8 +14,7 @@ import logging import os -from typing import List, Optional, Sequence, Union -from weakref import proxy +from typing import List, Optional, Union import torch @@ -57,31 +56,23 @@ DDPStrategy, DeepSpeedStrategy, HorovodStrategy, - IPUStrategy, SingleDeviceStrategy, SingleTPUStrategy, Strategy, StrategyRegistry, TPUSpawnStrategy, ) -from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, AMPType, device_parser -from pytorch_lightning.utilities.enums import PrecisionType -from pytorch_lightning.utilities.exceptions import ( - DeviceNotAvailibleException, - ImpactableConfigurationException, - MisconfigurationException, -) -from pytorch_lightning.utilities.imports import ( - _GPU_AVAILABLE, - _HOROVOD_AVAILABLE, - _IPU_AVAILABLE, - _TORCH_GREATER_EQUAL_1_8, - _TPU_AVAILABLE, +from pytorch_lightning.utilities import ( + _StrategyType, + AMPType, + device_parser, + rank_zero_deprecation, + rank_zero_info, + rank_zero_warn, ) -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn - -if _HOROVOD_AVAILABLE: - import horovod.torch as hvd +from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _GPU_AVAILABLE, _HOROVOD_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE log = logging.getLogger(__name__) @@ -118,7 +109,8 @@ def __init__( B. strategy flag could be : 1. strategy class 2. strategy str registered with strategyRegister - 3. strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) + 3. strategy str in _strategy_type enum which listed in each strategy as + backend (registed these too, and _strategy_type could be deprecated) C. plugins flag could be: 1. List of str, which could contains: @@ -168,7 +160,7 @@ def __init__( # --Cluster_environment----------------------------------------------------- self._choose_and_init_cluster_environment() - # --Strategy Part 1 : choose strategy --------------------------------------- + # --Strategy Part 1 : choose strategy and init strategy --------------------------------------- if self._strategy_flag is None: self._choose_strategy() # Reset strategy even user has specificed one @@ -192,7 +184,8 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl 2. 
accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; If accelerator is strategy related, logic handled in 1 above 3. precision could be set by precision and plugins flag - 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), set checkpoint_io and cluster_environment + 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), + set checkpoint_io and cluster_environment """ ( self._strategy_flag, @@ -256,6 +249,11 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._strategy_flag = accelerator if precision: + if not PrecisionType.supported_type(precision): + raise MisconfigurationException( + f"Precision {repr(precision)} is invalid. " + f"Allowed precision values: {PrecisionType.supported_types()}" + ) self._precision_flag = precision # handle duplications and conflict if plugins: @@ -285,7 +283,9 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl elif isinstance(plugin, ClusterEnvironment): self._cluster_environment = plugin else: - raise MisconfigurationException(f"Does not recognize flag {plugin}") + raise MisconfigurationException( + f"Found invalid type for plugin {plugin}. Expected a precision or training type plugin." + ) # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up if self._strategy_flag and isinstance(self._strategy_flag, Strategy): @@ -297,9 +297,11 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl else: self._accelerator_flag = self._strategy_flag.accelerator if self._strategy_flag.precision_plugin: - # precision has default value 32, we can not tell whether user set it or not [RFC] remove default from trainer? + # precision has default value 32, we can not tell whether user set it or not + # [RFC] remove default from trainer? # if self._precision_flag: - # raise MisconfigurationException("precision set through both strategy class and flags, choose one place to set") + # raise MisconfigurationException("precision set through both strategy class and flags, + # choose one place to set") # else: self._precision_flag = self._strategy_flag.precision_plugin if self._strategy_flag.checkpoint_io: @@ -318,13 +320,13 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") amp_type = amp_type.lower() if isinstance(amp_type, str) else None - self._amp_type_flag = AMPType.from_str(amp_type) if amp_type is not None else None + self._amp_type_flag = AMPType.from_str(amp_type) + print(f"a:{amp_type}, b{self._amp_type_flag}") - # TODO still working on these flags - # if amp_level is not None and self._amp_type_flag != AMPType.APEX: - # raise MisconfigurationException( - # f"You have asked for `amp_level={self._amp_level_flag!r}` but it's only supported with `amp_backend='apex'`." - # ) + if amp_level is not None and self._amp_type_flag != AMPType.APEX: + raise MisconfigurationException( + f"You have asked for `amp_level={amp_level!r}` but it's only supported with `amp_backend='apex'`." 
+ ) self._amp_level_flag = amp_level def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): @@ -334,13 +336,13 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 self._device_flag = devices - ##### to be deleted v1.7 + # --- to be deleted v1.7 deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag: self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ) - ##### deleted end + # --- deleted end if devices == "auto": if self._accelerator_flag is None: raise MisconfigurationException( @@ -351,16 +353,17 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ): - ##### to be deleted v1.7vbg + # ---- to be deleted v1.7vbg # set devices base on num_processes, gpus, ipus, tpu_cores if devices: rank_zero_warn( - f"The flag `devices={devices}` will be ignored, instand the device specific number {deprecated_devices_specific_flag} will be used" + f"The flag `devices={devices}` will be ignored, " + f"instand the device specific number {deprecated_devices_specific_flag} will be used" ) if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: - rank_zero_warn(f"more than one device specifc flag has been set") + rank_zero_warn("more than one device specifc flag has been set") self._device_flag = deprecated_devices_specific_flag if not self._accelerator_flag: @@ -373,7 +376,7 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self._accelerator_flag = "tpu" if ipus: self._accelerator_flag = "ipu" - #### delete end + # --- delete end def _choose_accelerator(self): if self._accelerator_flag == "auto": @@ -391,13 +394,13 @@ def _choose_accelerator(self): else: self._accelerator_flag = "cpu" - def _check_device_availibility(self): - for accelerator_flag, available in zip( - self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True] - ): - if self._accelerator_flag == accelerator_flag: - if not available: - raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") + # def _check_device_availibility(self): + # for accelerator_flag, available in zip( + # self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True] + # ): + # if self._accelerator_flag == accelerator_flag: + # if not available: + # raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") # TODO in progress for setting up devices def _set_parallel_devices_and_init_accelerator(self): @@ -493,7 +496,12 @@ def _choose_strategy(self): elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): self._strategy_flag = "ddp" elif len(self._parallel_devices) <= 1: - device = torch.device("cuda") if self._accelerator_flag == "gpu" else "cpu" + # device = torch.device("cuda", 1) if self._accelerator_flag == "gpu" else "cpu" + device = ( + device_parser.determine_root_gpu_device(self._parallel_devices) + if self._accelerator_flag == "gpu" + else "cpu" + ) self._strategy_flag = SingleDeviceStrategy(device=device) elif len(self._parallel_devices) > 
1: self._strategy_flag = "ddp_spawn" @@ -583,14 +591,17 @@ def _check_capatibility_and_init_precision(self): device = "cpu" if self._accelerator_flag == "cpu" else "cuda" if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): - return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + self.precision_plugin = ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) if isinstance(self.strategy, DDPFullyShardedStrategy): - return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - return NativeMixedPrecisionPlugin(self._precision_flag, device) + self.precision_plugin = FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + self.precision_plugin = NativeMixedPrecisionPlugin(self._precision_flag, device) + if self._amp_type_flag == AMPType.APEX: self._amp_level_flag = self._amp_level_flag or "O2" self.precision_plugin = ApexMixedPrecisionPlugin(self._amp_level_flag) - self.precision_plugin = PrecisionPlugin() + + if not self.precision_plugin: + self.precision_plugin = PrecisionPlugin() def _precision_misconfig_check(self): @@ -613,10 +624,9 @@ def _precision_misconfig_check(self): ) if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: raise MisconfigurationException( - f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but it's not supported." - " Try using `amp_type='native'` instead." + f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but " + "it's not supported. Try using `amp_type='native'` instead." ) - if self._precision_flag in (16, "bf16") and self._amp_type_flag == AMPType.APEX: if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy, DDPFullyShardedStrategy)): raise MisconfigurationException( @@ -676,7 +686,7 @@ def num_nodes(self): @property def num_processes(self): - return self.devices + return self.devices if self.devices is not None else 1 @property def root_gpu(self) -> Optional[int]: @@ -707,6 +717,7 @@ def num_gpus(self) -> int: def gpus(self): return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None + @property def is_distributed(self): # Used for custom plugins. # Custom plugins should implement is_distributed property. 
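
The hunk above reworks how `_check_capatibility_and_init_precision` picks `self.precision_plugin` from the accelerator, precision and AMP flags. As a reading aid, here is a minimal, self-contained sketch of that kind of flag-to-plugin resolution; the `Precision` class and the plugin names are stand-ins for illustration, not Lightning's real plugin classes, and a pure function with early returns is used so that no later assignment can silently overwrite an earlier, more specific choice:

```python
from dataclasses import dataclass


@dataclass
class Precision:
    """Stand-in for a precision plugin; only the chosen name matters here."""
    name: str


def choose_precision(accelerator: str, precision, amp_type: str = "native") -> Precision:
    # IPU and TPU accelerators own their precision handling.
    if accelerator == "ipu":
        return Precision("ipu")
    if accelerator == "tpu":
        return Precision("tpu_bf16" if precision in (16, "bf16") else "tpu")
    if precision == 32:
        return Precision("fp32")
    if precision == 64:
        return Precision("fp64")
    if precision in (16, "bf16"):
        if amp_type == "apex":
            return Precision("apex")  # an amp_level such as "O2" would apply here
        device = "cpu" if accelerator == "cpu" else "cuda"
        return Precision(f"native_amp[{device}]")
    raise RuntimeError(f"No precision plugin for precision={precision!r}")


if __name__ == "__main__":
    assert choose_precision("gpu", 16).name == "native_amp[cuda]"
    assert choose_precision("tpu", 16).name == "tpu_bf16"
    assert choose_precision("cpu", 32).name == "fp32"
```
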
@@ -728,12 +739,19 @@ def is_distributed(self): is_distributed |= self.strategy.is_distributed return is_distributed + @property def has_ipu(self): return isinstance(self.accelerator, IPUAccelerator) + @property + def use_ipu(self): + return self.has_ipu + + @property def has_tpu(self): return isinstance(self.accelerator, TPUAccelerator) + @property def use_dp(self): return isinstance(self.strategy, DataParallelStrategy) From c01aee5af7c4fc39c108a801e9f0abbe66c3a337 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Tue, 25 Jan 2022 16:45:40 -0800 Subject: [PATCH 06/69] remove print --- .../trainer/connectors/accelerator_connector.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 344d32dcfb310..9eaab976b8a38 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -135,7 +135,7 @@ def __init__( # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() - print(self._existing_strategies_str) + # print(self._existing_strategies_str) self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] self._supported_precision = PrecisionType.supported_types() @@ -264,7 +264,6 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl if plugins: plugins = [plugins] if not isinstance(plugins, list) else plugins for plugin in plugins: - print(plugin) if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: self._strategy_flag = plugin rank_zero_deprecation( @@ -321,7 +320,6 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl amp_type = amp_type.lower() if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) - print(f"a:{amp_type}, b{self._amp_type_flag}") if amp_level is not None and self._amp_type_flag != AMPType.APEX: raise MisconfigurationException( @@ -535,14 +533,12 @@ def _strategy_fallbacks(self): self._strategy_flag = _strategy_flag def _init_strategy(self): - print(self._strategy_flag) if isinstance(self._strategy_flag, str): self.strategy = StrategyRegistry.get(self._strategy_flag) else: self.strategy = self._strategy_flag def _check_capatibility_and_init_precision(self): - print(self._precision_flag) self._precision_misconfig_check() if isinstance(self._precision_flag, PrecisionPlugin): self.precision_plugin = self._precision_flag From d45eba0b56ca934f7fdbdccca3e9c424caf51aa5 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 26 Jan 2022 17:17:46 -0800 Subject: [PATCH 07/69] fix more tests --- .../connectors/accelerator_connector.py | 82 ++++++++---- .../test_accelerator_connector.py | 10 +- tests/trainer/test_trainer.py | 123 +++++++----------- 3 files changed, 112 insertions(+), 103 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 9eaab976b8a38..e89cb5d2ebf48 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -56,6 +56,7 @@ DDPStrategy, DeepSpeedStrategy, HorovodStrategy, + ParallelStrategy, SingleDeviceStrategy, SingleTPUStrategy, Strategy, @@ -72,7 +73,7 @@ ) 
from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _GPU_AVAILABLE, _HOROVOD_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE +from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE log = logging.getLogger(__name__) @@ -131,7 +132,7 @@ def __init__( C. When multiple flag set to the same thing? (ignore? not handled for now) """ - + torch.backends.cudnn.benchmark = benchmark # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() @@ -164,7 +165,7 @@ def __init__( if self._strategy_flag is None: self._choose_strategy() # Reset strategy even user has specificed one - self._strategy_fallbacks() + self._strategy_check_and_fallbacks() self._init_strategy() # --Precision---------------------------------------------------------------- @@ -196,6 +197,9 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._amp_level_flag, self._amp_type_flag, ) = (None, None, None, None, None, amp_type, amp_level) + if plugins: + plugins = [plugins] if not isinstance(plugins, list) else plugins + if strategy: self._strategy_flag = strategy if strategy == "ddp_cpu": @@ -210,24 +214,28 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) # handle duplications and conflict if isinstance(accelerator, Strategy) and strategy != accelerator: - raise MisconfigurationException("strategy already set through strategy flag, duplicated in accelerator") + raise MisconfigurationException( + "strategy already set through strategy flag, but have also passed in through accelerator" + ) if ( isinstance(accelerator, str) and accelerator in self._existing_strategies_str and strategy != accelerator ): raise MisconfigurationException( - "strategy str already set through strategy flag, duplicated in accelerator" + "strategy str already set through strategy flag, but have also passed in through accelerator" ) if plugins: for plugin in plugins: - if isinstance(plugin, Strategy) and strategy != plugin: + if isinstance(plugin, Strategy): raise MisconfigurationException( - "strategy already set through strategy flag, duplicated in plugins" + f"You have passed `Trainer(strategy)`" + f" and you can only specify one strategy, but you have passed {plugin} as a plugin." ) if isinstance(plugin, str) and plugin in self._existing_strategies_str: raise MisconfigurationException( - "strategy already set through strategy flag, duplicated in plugins" + f"You have passed `Trainer(strategy)`" + f" and you can only specify one strategy, but you have passed {plugin} as a plugin." 
) if ( @@ -262,7 +270,6 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl raise MisconfigurationException("precision set in both precision flag and plugin flag") if plugins: - plugins = [plugins] if not isinstance(plugins, list) else plugins for plugin in plugins: if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: self._strategy_flag = plugin @@ -334,13 +341,13 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 self._device_flag = devices - # --- to be deleted v1.7 + # Delete when remove num_processes, gpus, ipus and tpu_cores deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag: self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ) - # --- deleted end + # Delete end if devices == "auto": if self._accelerator_flag is None: raise MisconfigurationException( @@ -351,7 +358,6 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ): - # ---- to be deleted v1.7vbg # set devices base on num_processes, gpus, ipus, tpu_cores if devices: rank_zero_warn( @@ -366,15 +372,14 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( if not self._accelerator_flag: # set accelerator type base on num_processes, gpus, ipus, tpu_cores - if num_processes: - self._accelerator_flag = "cpu" - if gpus: - self._accelerator_flag = "gpu" - if tpu_cores: - self._accelerator_flag = "tpu" if ipus: self._accelerator_flag = "ipu" - # --- delete end + if tpu_cores: + self._accelerator_flag = "tpu" + if gpus: + self._accelerator_flag = "gpu" + if num_processes: + self._accelerator_flag = "cpu" def _choose_accelerator(self): if self._accelerator_flag == "auto": @@ -382,7 +387,7 @@ def _choose_accelerator(self): self._accelerator_flag = "tpu" elif _IPU_AVAILABLE: self._accelerator_flag = "ipu" - elif _GPU_AVAILABLE: + elif torch.cuda.is_available() and torch.cuda.device_count() > 0: self._accelerator_flag = "gpu" else: self._accelerator_flag = "cpu" @@ -487,7 +492,7 @@ def _choose_strategy(self): if self._parallel_devices and len(self._parallel_devices) > 1: self._strategy_flag = "tpu_spawn" else: - self._srategy_flag = SingleTPUStrategy() + self._srategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) else: if self._num_nodes_flag > 1: self._strategy_flag = "ddp" @@ -506,8 +511,10 @@ def _choose_strategy(self): else: self._strategy_flag = "ddp" - def _strategy_fallbacks(self): + def _strategy_check_and_fallbacks(self): + # fallback apply to user pass in object as well, so get the _strategy_flag first _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag + if _strategy_flag == "ddp_cpu": if _TPU_AVAILABLE: raise MisconfigurationException( @@ -529,6 +536,12 @@ def _strategy_fallbacks(self): if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": rank_zero_warn(f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") _strategy_flag = "ddp" + if isinstance(self.accelerator, TPUAccelerator) and "tpu" not in _strategy_flag: + raise ValueError( + "The `TPUAccelerator` can only be 
used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," + f" found {_strategy_flag}." + ) + if _strategy_flag: self._strategy_flag = _strategy_flag @@ -600,7 +613,7 @@ def _check_capatibility_and_init_precision(self): self.precision_plugin = PrecisionPlugin() def _precision_misconfig_check(self): - + # TODO change exception type to ImpactableConfigurationException if self._accelerator_flag == "ipu": if self._precision_flag not in (16, 32): raise MisconfigurationException( @@ -612,6 +625,13 @@ def _precision_misconfig_check(self): " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" " requesting this feature." ) + if self._accelerator_flag == "tpu" and isinstance( + self._precision_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin) + ): + raise ValueError( + f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," + f" found: {self.strategy.precision_plugin}." + ) if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: # apex was explicitly passed, not a good idea to silently switch to native AMP raise MisconfigurationException( @@ -694,7 +714,12 @@ def root_gpu(self) -> Optional[int]: @property def devices(self): - return len(self._parallel_devices) + if isinstance(self.strategy, SingleDeviceStrategy): + return 1 + elif isinstance(self.strategy, ParallelStrategy): + return len(self.strategy.parallel_devices) + else: + return 0 @property def tpu_cores(self) -> int: @@ -706,13 +731,20 @@ def ipus(self) -> int: @property def num_gpus(self) -> int: - return self.devices + if isinstance(self.accelerator, GPUAccelerator): + return self.devices + else: + return 0 # def parallel_device_ids(): @property def gpus(self): return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None + @property + def parallel_device_ids(self): + return [i for i in range(len(self.parallel_devices))] + @property def is_distributed(self): # Used for custom plugins. 
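
The properties in the hunks above (`devices`, `num_gpus`, `gpus`, `parallel_device_ids`) now derive device information from the already-initialized strategy and accelerator instead of echoing the raw Trainer flags. The following is a minimal sketch of that derivation pattern, using hypothetical stand-in classes rather than Lightning's real strategy hierarchy:

```python
from typing import List, Optional


class Strategy:
    ...


class SingleDeviceStrategy(Strategy):
    ...


class ParallelStrategy(Strategy):
    def __init__(self, parallel_devices: List[str]):
        self.parallel_devices = parallel_devices


def device_count(strategy: Strategy) -> int:
    # Derive the count from the strategy itself rather than from the raw flag.
    if isinstance(strategy, SingleDeviceStrategy):
        return 1
    if isinstance(strategy, ParallelStrategy):
        return len(strategy.parallel_devices)
    return 0


def parallel_device_ids(strategy: Strategy, on_gpu: bool) -> Optional[List[int]]:
    # GPU runs report local device indices; other accelerators report None.
    if not on_gpu or not isinstance(strategy, ParallelStrategy):
        return None
    return list(range(len(strategy.parallel_devices)))


if __name__ == "__main__":
    ddp_like = ParallelStrategy(parallel_devices=["cuda:0", "cuda:1"])
    assert device_count(ddp_like) == 2
    assert parallel_device_ids(ddp_like, on_gpu=True) == [0, 1]
    assert device_count(SingleDeviceStrategy()) == 1
```
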
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 8617b5a2c8095..a6b65e9542f0c 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -601,8 +601,9 @@ def test_exception_when_strategy_used_with_accelerator(): def test_exception_when_strategy_used_with_plugins(): - with pytest.raises(MisconfigurationException, match="only specify one training type plugin, but you have passed"): - Trainer(plugins="ddp_find_unused_parameters_false", strategy="ddp_spawn") + with pytest.raises(MisconfigurationException, match="only specify one strategy, but you have passed"): + with pytest.deprecated_call(match=r"`strategy` to the `plugins` flag in Trainer has been deprecated"): + Trainer(plugins="ddp_find_unused_parameters_false", strategy="ddp_spawn") def test_exception_invalid_strategy(): @@ -898,13 +899,14 @@ def test_unsupported_tpu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='tpu', precision=64\)` is not implemented"): Trainer(accelerator="tpu", precision=64) + # if user haven't set strategy, accelerator_connector will choose the TPUSingleStrategy or TPUSpawnStrategy with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"): - Trainer(accelerator="tpu", precision=16) + Trainer(accelerator="tpu", precision=16, strategy="ddp") with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but apex AMP is not supported"): - Trainer(accelerator="tpu", precision=16, amp_backend="apex") + Trainer(accelerator="tpu", precision=16, amp_backend="apex", strategy="single_device") def test_unsupported_ipu_choice(monkeypatch): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 587ff0b7b9f72..32aa94b8e0b2c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1177,81 +1177,75 @@ def val_dataloader(self): [ ( dict(accelerator=None, gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="single_device", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="dp", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp", num_nodes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=None), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2 - ), + 
dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp2", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator=None, gpus=1), - dict(_strategy_type=None, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type="single_device", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator="dp", gpus=1), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator="ddp", gpus=1), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=1), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp2", gpus=1), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator=None, gpus=2), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=2 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="dp", gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp", gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp2", gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp2", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="dp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ], ) @@ -1264,9 +1258,9 @@ def test_trainer_config(trainer_kwargs, expected, monkeypatch): else: with pytest.deprecated_call(match=r"accelerator='.*'\)` has been deprecated in v1.5"): trainer = Trainer(**trainer_kwargs) - assert len(expected) == 4 + assert len(expected) == 3 for k, v in expected.items(): - assert getattr(trainer, k) == v, f"Failed {k}: {v}" + assert getattr(trainer, k) == v, f"Failed on {trainer_kwargs}, where {k}={ getattr(trainer, k)}, not {v}" def test_trainer_subclassing(): @@ -2103,122 +2097,107 @@ def training_step(self, batch, batch_idx): [ ( dict(strategy=None, 
gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="single_device", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="dp", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp", num_nodes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp2", gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy=None, gpus=1), - dict(_strategy_type=None, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type="single_device", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="dp", gpus=1), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp", gpus=1), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp_spawn", gpus=1), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp2", gpus=1), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1, num_processes=1), + dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy=None, gpus=2), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=2 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="dp", gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp", gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp2", gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp2", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, 
num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="dp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp_spawn", num_processes=2, gpus=None), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp_spawn", num_processes=1, gpus=None), - dict(_strategy_type=None, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=1), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp_fully_sharded", gpus=1), - dict( - _strategy_type=_StrategyType.DDP_FULLY_SHARDED, - _device_type=_AcceleratorType.GPU, - num_gpus=1, - num_processes=1, - ), + dict(_strategy_type=_StrategyType.DDP_FULLY_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy=DDPSpawnStrategy(), num_processes=2, gpus=None), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy=DDPSpawnStrategy(), gpus=2), - dict( - _strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1 - ), + dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDPStrategy(), num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0, num_processes=2), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy=DDPStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDP2Strategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DataParallelStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1), + dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDPFullyShardedStrategy(), gpus=2), @@ -2226,7 +2205,6 @@ def training_step(self, batch, batch_idx): _strategy_type=_StrategyType.DDP_FULLY_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=2, - num_processes=1, ), ), ( @@ -2235,14 +2213,11 @@ def training_step(self, batch, batch_idx): _strategy_type=_StrategyType.DDP_SHARDED_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2, - num_processes=1, ), ), ( dict(strategy=DDPShardedStrategy(), gpus=2), - dict( - _strategy_type=_StrategyType.DDP_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=2, num_processes=1 - ), + dict(_strategy_type=_StrategyType.DDP_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=2), ), ], ) @@ -2251,6 +2226,6 @@ def test_trainer_config_strategy(trainer_kwargs, expected, monkeypatch): monkeypatch.setattr(torch.cuda, "is_available", lambda: True) 
monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["gpus"]) trainer = Trainer(**trainer_kwargs) - assert len(expected) == 4 + assert len(expected) == 3 for k, v in expected.items(): - assert getattr(trainer, k) == v, f"Failed {k}: {v}" + assert getattr(trainer, k) == v, f"Failed on {trainer_kwargs}, where {k}={ getattr(trainer, k)}, not {v}" From ec17b316d8034f161458fe45c8e282679b395762 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 27 Jan 2022 12:13:05 -0800 Subject: [PATCH 08/69] change trainer.gpus --- .../trainer/connectors/accelerator_connector.py | 15 +++++++++------ pytorch_lightning/trainer/trainer.py | 2 +- tests/models/test_gpu.py | 1 + 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index e89cb5d2ebf48..9dd1ee343bc1b 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -133,6 +133,7 @@ def __init__( """ torch.backends.cudnn.benchmark = benchmark + self._gpus = gpus # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() @@ -175,7 +176,7 @@ def __init__( self._lazy_init_strategy() # set properties not used in accelerator_connector. TODO move out of this file - # self.gpus = gpus or devices + self.replace_sampler_ddp = replace_sampler_ddp def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): @@ -343,12 +344,12 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce self._device_flag = devices # Delete when remove num_processes, gpus, ipus and tpu_cores deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores - if deprecated_devices_specific_flag: + if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ) # Delete end - if devices == "auto": + if self._device_flag == "auto": if self._accelerator_flag is None: raise MisconfigurationException( f"You passed `devices={devices}` but haven't specified" @@ -364,6 +365,7 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( f"The flag `devices={devices}` will be ignored, " f"instand the device specific number {deprecated_devices_specific_flag} will be used" ) + gpus = int(gpus) if isinstance(gpus, str) and gpus.isnumeric() else gpus if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: @@ -448,7 +450,7 @@ def _set_parallel_devices_and_init_accelerator(self): ) self._parallel_devices = [torch.device("cpu")] * self._device_flag - self._gpus = self._device_flag + self._gpus = self._device_flag if not self._gpus else self._gpus def _choose_and_init_cluster_environment(self): self.cluster_environment = LightningEnvironment() @@ -739,11 +741,12 @@ def num_gpus(self) -> int: # def parallel_device_ids(): @property def gpus(self): - return self._gpus if isinstance(self.accelerator, GPUAccelerator) else None + return self._gpus + # if isinstance(self.accelerator, GPUAccelerator) else 0 @property def parallel_device_ids(self): - return [i for i in 
range(len(self.parallel_devices))] + return [i for i in range(len(self.parallel_devices))] if isinstance(self.accelerator, GPUAccelerator) else None @property def is_distributed(self): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bd648dd99d332..70f72bd2488cc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -2006,7 +2006,7 @@ def devices(self) -> Optional[Union[List[int], str, int]]: @property def data_parallel_device_ids(self) -> Optional[List[int]]: - return self._accelerator_connector.parallel_devices + return self._accelerator_connector.parallel_device_ids @property def lightning_module(self) -> "pl.LightningModule": diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index c494c0c1c18e6..190936096ddef 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -243,6 +243,7 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus trainer = Trainer(gpus=gpus) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) assert trainer._accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert trainer.gpus == gpus From ffeea284e3acde0a9982879160d9262e699ebc31 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 12:58:53 -0800 Subject: [PATCH 09/69] fix tests --- .../connectors/accelerator_connector.py | 176 +++++++++--------- .../test_accelerator_connector.py | 8 +- tests/accelerators/test_ipu.py | 5 +- tests/accelerators/test_tpu.py | 13 +- 4 files changed, 102 insertions(+), 100 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 9dd1ee343bc1b..527b64b5625e1 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -133,7 +133,7 @@ def __init__( """ torch.backends.cudnn.benchmark = benchmark - self._gpus = gpus + # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() @@ -154,9 +154,9 @@ def __init__( # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._choose_accelerator() - # else: - # # [RFC] move to XAccelerator class init? - # self._check_device_availibility() + else: + # # [RFC] move to XAccelerator class init? + self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() # --Cluster_environment----------------------------------------------------- @@ -170,13 +170,11 @@ def __init__( self._init_strategy() # --Precision---------------------------------------------------------------- - self._check_capatibility_and_init_precision() + self.precision_plugin = self._check_capatibility_and_init_precision() # --Strategy Part 2 : init Strategy and set Strategy properties ------------- self._lazy_init_strategy() - # set properties not used in accelerator_connector. TODO move out of this file - self.replace_sampler_ddp = replace_sampler_ddp def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): @@ -239,23 +237,24 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl f" and you can only specify one strategy, but you have passed {plugin} as a plugin." 
) - if ( - accelerator in self._existing_accelerator_type - or accelerator == "auto" - or isinstance(accelerator, Accelerator) - ): - self._accelerator_flag = accelerator - elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): - rank_zero_deprecation( - f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" - f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." - ) - self._strategy_flag = accelerator - elif accelerator == "ddp_cpu": - rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." - ) - self._strategy_flag = accelerator + if accelerator: + if ( + accelerator in self._existing_accelerator_type + or accelerator == "auto" + or isinstance(accelerator, Accelerator) + ): + self._accelerator_flag = accelerator + elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): + rank_zero_deprecation( + f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" + f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." + ) + self._strategy_flag = accelerator + elif accelerator == "ddp_cpu": + rank_zero_warn( + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." + ) + self._strategy_flag = accelerator if precision: if not PrecisionType.supported_type(precision): @@ -265,10 +264,12 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) self._precision_flag = precision # handle duplications and conflict - if plugins: - for plugin in plugins: - if isinstance(plugin, PrecisionPlugin): - raise MisconfigurationException("precision set in both precision flag and plugin flag") + # [RFC] current logic doesn't handle precision_plugin duplication + # if plugins: + # for plugin in plugins: + # if isinstance(plugin, PrecisionPlugin): + # self._precision_flag = precision + # raise MisconfigurationException("precision set in both precision flag and plugin flag") if plugins: for plugin in plugins: @@ -279,10 +280,8 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={plugin})` instead." 
) - elif ( - isinstance(plugin, PrecisionPlugin) - or isinstance(plugin, str) - and plugin in self._supported_precision + elif isinstance(plugin, PrecisionPlugin) or ( + isinstance(plugin, str) and plugin in self._supported_precision ): self._precision_flag = plugin elif isinstance(plugin, CheckpointIO): @@ -343,6 +342,7 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce self._device_flag = devices # Delete when remove num_processes, gpus, ipus and tpu_cores + self._gpus = gpus deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( @@ -399,18 +399,20 @@ def _choose_accelerator(self): else: self._accelerator_flag = "cpu" - # def _check_device_availibility(self): - # for accelerator_flag, available in zip( - # self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, _GPU_AVAILABLE, True] - # ): - # if self._accelerator_flag == accelerator_flag: - # if not available: - # raise DeviceNotAvailibleException(f"{accelerator_flag} not avalible") + # TODO move this to xAccelerator + def _check_device_availibility(self): + for accelerator_flag, available in zip( + self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, torch.cuda.is_available(), True] + ): + # only apply to gpu to keep backward compatibility + if self._accelerator_flag == accelerator_flag == "gpu": + if not available: + raise MisconfigurationException( + f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" + ) - # TODO in progress for setting up devices def _set_parallel_devices_and_init_accelerator(self): self._parallel_devices = [] - if isinstance(self._accelerator_flag, Accelerator): self.accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": @@ -419,6 +421,8 @@ def _set_parallel_devices_and_init_accelerator(self): self._device_flag = TPUAccelerator.auto_device_count() if isinstance(self._device_flag, int): self._parallel_devices = list(range(self._device_flag)) + else: + self._parallel_devices = self._device_flag elif self._accelerator_flag == "ipu": self.accelerator = IPUAccelerator() @@ -436,7 +440,7 @@ def _set_parallel_devices_and_init_accelerator(self): self._parallel_devices = [ torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag) ] - elif isinstance(self._device_flag, list): + else: self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] elif self._accelerator_flag == "cpu": @@ -471,13 +475,7 @@ def _is_sharded_training_type(self) -> bool: def _is_slurm_managing_tasks(self): """used by choosing cluster enviroment.""" - if ( - # (not self._strategy_flag=="ddp" and not self._strategy_flag=="ddp2") - # the above logic moved to _select_strategy(), only check _is_slurm_managing_tasks() - # when strategy flag is ddp or ddp2 - not SLURMEnvironment.detect() - or SLURMEnvironment.job_name() == "bash" - ): + if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": return False total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag @@ -498,10 +496,7 @@ def _choose_strategy(self): else: if self._num_nodes_flag > 1: self._strategy_flag = "ddp" - elif TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks(): - self._strategy_flag = "ddp" elif len(self._parallel_devices) <= 1: - # device = torch.device("cuda", 1) if 
self._accelerator_flag == "gpu" else "cpu" device = ( device_parser.determine_root_gpu_device(self._parallel_devices) if self._accelerator_flag == "gpu" @@ -538,11 +533,13 @@ def _strategy_check_and_fallbacks(self): if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": rank_zero_warn(f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") _strategy_flag = "ddp" - if isinstance(self.accelerator, TPUAccelerator) and "tpu" not in _strategy_flag: - raise ValueError( - "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," - f" found {_strategy_flag}." - ) + # Current test check precision first. So move this test to the end for now. + # TODO update tests and uncomment this part + # if isinstance(self.accelerator, TPUAccelerator) and "tpu" not in _strategy_flag: + # raise ValueError( + # "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," + # f" found {_strategy_flag}." + # ) if _strategy_flag: self._strategy_flag = _strategy_flag @@ -552,39 +549,35 @@ def _init_strategy(self): self.strategy = StrategyRegistry.get(self._strategy_flag) else: self.strategy = self._strategy_flag + # print(self.strategy) def _check_capatibility_and_init_precision(self): self._precision_misconfig_check() if isinstance(self._precision_flag, PrecisionPlugin): - self.precision_plugin = self._precision_flag - return + return self._precision_flag + self.precision_plugin = None - if self._accelerator_flag == "ipu": - self.precision_plugin = IPUPrecisionPlugin(self._precision_flag) - if self._accelerator_flag == "tpu": + if isinstance(self.accelerator, IPUAccelerator): + return IPUPrecisionPlugin(self._precision_flag) + if isinstance(self.accelerator, TPUAccelerator): if self._precision_flag == 32: - self.precision_plugin = TPUPrecisionPlugin() + return TPUPrecisionPlugin() elif self._precision_flag in (16, "bf16"): if self._precision_flag == 16: - # this is not deprecated to ease transition between accelerator environments rank_zero_warn( f"You passed `Trainer(accelerator='tpu', precision=16)` but {self._amp_type_flag.value} AMP" f" is not supported with TPUs. Using `precision='bf16'` instead." ) - self.precision_plugin = TPUBf16PrecisionPlugin() + return TPUBf16PrecisionPlugin() if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): - self.precision_plugin = DeepSpeedPrecisionPlugin( - self._precision_flag, self._amp_type_flag, self._amp_level_flag - ) + return DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) if self._precision_flag == 32: - self.precision_plugin = PrecisionPlugin() + return PrecisionPlugin() if self._precision_flag == 64: - self.precision_plugin = DoublePrecisionPlugin() + return DoublePrecisionPlugin() - # maybe convert the precision value if self._precision_flag == 16 and self._accelerator_flag == "cpu": - # this automatic switch is to ease transition between accelerator environments rank_zero_warn( "You passed `Trainer(accelerator='cpu', precision=16)` but native AMP is not supported on CPU." " Using `precision='bf16'` instead." 
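
The hunks above also show `_strategy_check_and_fallbacks` normalizing the strategy flag before anything is instantiated. The sketch below illustrates only the two fallbacks that are visible in the diff; the exact `ddp_cpu` replacement is not fully reproduced in this section, so mapping it to `ddp_spawn` here is an assumption made purely for illustration:

```python
def resolve_strategy_flag(flag: str, accelerator: str, tpu_available: bool = False) -> str:
    """Simplified sketch of the pre-instantiation fallbacks; string flags only.

    The real connector also handles Strategy instances and a registry lookup,
    and its actual `ddp_cpu` handling is richer than what is shown here.
    """
    if flag == "ddp_cpu":
        if tpu_available:
            raise ValueError("`accelerator='ddp_cpu'` is not supported on TPU machines.")
        # Assumption for illustration: force CPU and fall back to a spawn-based DDP.
        accelerator = "cpu"
        flag = "ddp_spawn"
    if flag in ("dp", "ddp2") and accelerator == "cpu":
        # DP/DDP2 require GPUs, so plain DDP is used on CPU instead.
        flag = "ddp"
    return flag


if __name__ == "__main__":
    assert resolve_strategy_flag("dp", accelerator="cpu") == "ddp"
    assert resolve_strategy_flag("ddp_cpu", accelerator="gpu") == "ddp_spawn"
```
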
@@ -602,40 +595,44 @@ def _check_capatibility_and_init_precision(self): device = "cpu" if self._accelerator_flag == "cpu" else "cuda" if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): - self.precision_plugin = ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) if isinstance(self.strategy, DDPFullyShardedStrategy): - self.precision_plugin = FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - self.precision_plugin = NativeMixedPrecisionPlugin(self._precision_flag, device) + return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) + return NativeMixedPrecisionPlugin(self._precision_flag, device) if self._amp_type_flag == AMPType.APEX: self._amp_level_flag = self._amp_level_flag or "O2" - self.precision_plugin = ApexMixedPrecisionPlugin(self._amp_level_flag) + return ApexMixedPrecisionPlugin(self._amp_level_flag) - if not self.precision_plugin: - self.precision_plugin = PrecisionPlugin() + raise RuntimeError("No precision set") def _precision_misconfig_check(self): # TODO change exception type to ImpactableConfigurationException - if self._accelerator_flag == "ipu": + if isinstance(self.accelerator, IPUAccelerator): if self._precision_flag not in (16, 32): raise MisconfigurationException( f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." ) - if self._accelerator_flag == "tpu" and self._precision_flag == 64: + if isinstance(self.accelerator, TPUAccelerator) and self._precision_flag == 64: raise MisconfigurationException( "`Trainer(accelerator='tpu', precision=64)` is not implemented." " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" " requesting this feature." ) - if self._accelerator_flag == "tpu" and isinstance( - self._precision_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin) + if ( + isinstance(self.accelerator, TPUAccelerator) + and isinstance(self._precision_flag, PrecisionPlugin) + and not isinstance(self._precision_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin)) ): raise ValueError( f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," f" found: {self.strategy.precision_plugin}." ) - if self._precision_flag == 16 and self._accelerator_flag == "cpu" and self._amp_type_flag == AMPType.APEX: - # apex was explicitly passed, not a good idea to silently switch to native AMP + if ( + self._precision_flag == 16 + and isinstance(self.accelerator, CPUAccelerator) + and self._amp_type_flag == AMPType.APEX + ): raise MisconfigurationException( "You passed `Trainer(accelerator='cpu', precision=16, amp_type='apex')`" " but apex AMP not supported on CPU." @@ -661,6 +658,8 @@ def _lazy_init_strategy(self): self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): self.strategy.parallel_devices = self._parallel_devices + if hasattr(self.strategy, "num_nodes"): + self.strategy._num_nodes = self._num_nodes_flag from pytorch_lightning.utilities import _IS_INTERACTIVE @@ -676,6 +675,13 @@ def _lazy_init_strategy(self): ) + if isinstance(self.accelerator, TPUAccelerator) and not isinstance( + self.strategy, (SingleTPUStrategy, TPUSpawnStrategy) + ): + raise ValueError( + "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," + f" found {self.strategy}." 
+ ) ############################################################################## # the following logic should be deprecated/removed, and these information should be @@ -700,7 +706,7 @@ def device_type(self): @property def num_nodes(self): - return self._num_nodes + return self._num_nodes_flag @property def num_processes(self): diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index a6b65e9542f0c..e121d27bddd86 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -445,15 +445,17 @@ def test_accelerator_choice_multi_node_gpu( assert isinstance(trainer.strategy, plugin) -@pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't require GPU") -def test_accelerator_cpu(): +@mock.patch("torch.cuda.is_available", return_value=False) +def test_accelerator_cpu(mack_gpu_avalible): trainer = Trainer(accelerator="cpu") assert trainer._device_type == "cpu" assert isinstance(trainer.accelerator, CPUAccelerator) - with pytest.raises(MisconfigurationException, match="You passed `accelerator='gpu'`, but GPUs are not available"): + with pytest.raises(MisconfigurationException): + trainer = Trainer(gpus=1) + with pytest.raises(MisconfigurationException): trainer = Trainer(accelerator="gpu") with pytest.raises(MisconfigurationException, match="You requested GPUs:"): diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 861b149733c0c..a691f4f62d983 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -505,10 +505,7 @@ def test_accelerator_ipu(): assert trainer._device_type == "ipu" assert isinstance(trainer.accelerator, IPUAccelerator) - with pytest.raises( - MisconfigurationException, match="You passed `accelerator='ipu'`, but you didn't pass `ipus` to `Trainer`" - ): - trainer = Trainer(accelerator="ipu") + trainer = Trainer(accelerator="ipu") trainer = Trainer(accelerator="auto", ipus=8) diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 608d98304c757..bec80ec9ccbc1 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -13,7 +13,7 @@ # limitations under the License import collections from copy import deepcopy -from unittest.mock import Mock, patch +from unittest.mock import patch import pytest import torch @@ -23,7 +23,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator -from pytorch_lightning.plugins import TPUPrecisionPlugin, XLACheckpointIO +from pytorch_lightning.plugins import PrecisionPlugin, TPUPrecisionPlugin, XLACheckpointIO from pytorch_lightning.strategies import DDPStrategy, TPUSpawnStrategy from pytorch_lightning.utilities import find_shared_parameters from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -90,10 +90,7 @@ def test_accelerator_tpu(): assert trainer._device_type == "tpu" assert isinstance(trainer.accelerator, TPUAccelerator) - with pytest.raises( - MisconfigurationException, match="You passed `accelerator='tpu'`, but you didn't pass `tpu_cores` to `Trainer`" - ): - trainer = Trainer(accelerator="tpu") + trainer = Trainer(accelerator="tpu") @RunIf(tpu=True) @@ -290,7 +287,7 @@ def forward(self, x): def test_tpu_invalid_raises(): - training_type_plugin = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=Mock()) + training_type_plugin = 
TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=PrecisionPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): Trainer(strategy=training_type_plugin) @@ -301,7 +298,7 @@ def test_tpu_invalid_raises(): def test_tpu_invalid_raises_set_precision_with_strategy(): accelerator = TPUAccelerator() - training_type_plugin = TPUSpawnStrategy(accelerator=accelerator, precision_plugin=object()) + training_type_plugin = TPUSpawnStrategy(accelerator=accelerator, precision_plugin=PrecisionPlugin()) with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): Trainer(strategy=training_type_plugin) From 57b16423f6ee7c66c5c7537a0ccc79c9767c8572 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 13:48:44 -0800 Subject: [PATCH 10/69] remove gpu avalible check --- .../connectors/accelerator_connector.py | 26 +++++++++---------- .../test_accelerator_connector.py | 4 +-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 527b64b5625e1..6efc6c336c46c 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -154,9 +154,9 @@ def __init__( # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._choose_accelerator() - else: - # # [RFC] move to XAccelerator class init? - self._check_device_availibility() + # else: + # # [RFC] move to XAccelerator class init? + # self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() # --Cluster_environment----------------------------------------------------- @@ -400,16 +400,16 @@ def _choose_accelerator(self): self._accelerator_flag = "cpu" # TODO move this to xAccelerator - def _check_device_availibility(self): - for accelerator_flag, available in zip( - self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, torch.cuda.is_available(), True] - ): - # only apply to gpu to keep backward compatibility - if self._accelerator_flag == accelerator_flag == "gpu": - if not available: - raise MisconfigurationException( - f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" - ) + # def _check_device_availibility(self): + # for accelerator_flag, available in zip( + # self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, torch.cuda.is_available(), True] + # ): + # # only apply to gpu to keep backward compatibility + # if self._accelerator_flag == accelerator_flag == "gpu": + # if not available: + # raise MisconfigurationException( + # f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" + # ) def _set_parallel_devices_and_init_accelerator(self): self._parallel_devices = [] diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index e121d27bddd86..0086992165143 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -455,8 +455,8 @@ def test_accelerator_cpu(mack_gpu_avalible): with pytest.raises(MisconfigurationException): trainer = Trainer(gpus=1) - with pytest.raises(MisconfigurationException): - trainer = Trainer(accelerator="gpu") + # with pytest.raises(MisconfigurationException): + # trainer = Trainer(accelerator="gpu") with 
pytest.raises(MisconfigurationException, match="You requested GPUs:"): trainer = Trainer(accelerator="cpu", gpus=1) From d374aa9f12b4acbe30540854a911e3c4122f4343 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 15:20:38 -0800 Subject: [PATCH 11/69] update --- .../connectors/accelerator_connector.py | 36 ++++++++++--------- .../test_accelerator_connector.py | 6 ++-- tests/strategies/test_ddp_strategy.py | 7 +++- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6efc6c336c46c..aa88bd1c20c9a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -133,6 +133,8 @@ def __init__( """ torch.backends.cudnn.benchmark = benchmark + self.replace_sampler_ddp = replace_sampler_ddp + self.sync_batchnorm = sync_batchnorm # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins @@ -175,8 +177,6 @@ def __init__( # --Strategy Part 2 : init Strategy and set Strategy properties ------------- self._lazy_init_strategy() - self.replace_sampler_ddp = replace_sampler_ddp - def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): """This method checks: @@ -295,29 +295,29 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up if self._strategy_flag and isinstance(self._strategy_flag, Strategy): - if self._strategy_flag.accelerator: + if self._strategy_flag._accelerator: if self._accelerator_flag: raise MisconfigurationException( "accelerator set through both strategy class and accelerator flag, choose one" ) else: - self._accelerator_flag = self._strategy_flag.accelerator - if self._strategy_flag.precision_plugin: + self._accelerator_flag = self._strategy_flag._accelerator + if self._strategy_flag._precision_plugin: # precision has default value 32, we can not tell whether user set it or not # [RFC] remove default from trainer? 
# if self._precision_flag: # raise MisconfigurationException("precision set through both strategy class and flags, # choose one place to set") # else: - self._precision_flag = self._strategy_flag.precision_plugin - if self._strategy_flag.checkpoint_io: + self._precision_flag = self._strategy_flag._precision_plugin + if self._strategy_flag._checkpoint_io: if self.checkpoint_io: raise MisconfigurationException( "checkpoint_io set through both strategy class and plugins, choose one" ) else: - self.checkpoint_io = self._strategy_flag.checkpoint_io - if getattr(self._strategy_flag, "cluster_environment", None): + self.checkpoint_io = self._strategy_flag._checkpoint_io + if getattr(self._strategy_flag, "_cluster_environment", None): if self._cluster_environment: raise MisconfigurationException( "cluster_environment set through both strategy class and plugins, choose one" @@ -343,6 +343,9 @@ def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_proce self._device_flag = devices # Delete when remove num_processes, gpus, ipus and tpu_cores self._gpus = gpus + self._tpu_cores = tpu_cores + gpus = device_parser.parse_gpu_ids(gpus) + tpu_cores = device_parser.parse_tpu_cores(tpu_cores) deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( @@ -365,7 +368,7 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( f"The flag `devices={devices}` will be ignored, " f"instand the device specific number {deprecated_devices_specific_flag} will be used" ) - gpus = int(gpus) if isinstance(gpus, str) and gpus.isnumeric() else gpus + if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: @@ -447,12 +450,13 @@ def _set_parallel_devices_and_init_accelerator(self): self.accelerator = CPUAccelerator() if self._device_flag == "auto" or not self._device_flag: self._device_flag = CPUAccelerator.auto_device_count() - if not isinstance(self._device_flag, int): - raise MisconfigurationException( + if isinstance(self._device_flag, int): + self._parallel_devices = [torch.device("cpu")] * self._device_flag + else: + rank_zero_warn( "The flag `devices` must be an int with `accelerator='cpu'`," f" got `devices={self._device_flag}` instead." ) - self._parallel_devices = [torch.device("cpu")] * self._device_flag self._gpus = self._device_flag if not self._gpus else self._gpus @@ -549,7 +553,6 @@ def _init_strategy(self): self.strategy = StrategyRegistry.get(self._strategy_flag) else: self.strategy = self._strategy_flag - # print(self.strategy) def _check_capatibility_and_init_precision(self): self._precision_misconfig_check() @@ -625,8 +628,7 @@ def _precision_misconfig_check(self): and not isinstance(self._precision_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin)) ): raise ValueError( - f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," - f" found: {self.strategy.precision_plugin}." + f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," f" found: {self._precision_flag}." 
) if ( self._precision_flag == 16 @@ -660,6 +662,8 @@ def _lazy_init_strategy(self): self.strategy.parallel_devices = self._parallel_devices if hasattr(self.strategy, "num_nodes"): self.strategy._num_nodes = self._num_nodes_flag + if hasattr(self.strategy, "sync_batchnorm"): + self.strategy.sync_batchnorm = self.sync_batchnorm from pytorch_lightning.utilities import _IS_INTERACTIVE diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 0086992165143..338b6441944cb 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -575,8 +575,10 @@ def test_set_devices_if_none_gpu(): def test_devices_with_cpu_only_supports_integer(): - with pytest.raises(MisconfigurationException, match="The flag `devices` must be an int"): - Trainer(accelerator="cpu", devices="1,3") + with pytest.warns(UserWarning, match="The flag `devices` must be an int"): + trainer = Trainer(accelerator="cpu", devices="1,3") + assert isinstance(trainer.accelerator, CPUAccelerator) + assert trainer.devices == 1 @pytest.mark.parametrize("training_type", ["ddp2", "dp"]) diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index 157908309f0e6..dddeaed26d98f 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -97,7 +97,7 @@ def creates_processes_externally(self): @RunIf(skip_windows=True) -def test_ddp_configure_ddp(): +def test_ddp_configure_ddp_fitting(): """Tests with ddp strategy.""" model = BoringModel() ddp_strategy = DDPStrategy() @@ -115,6 +115,11 @@ def test_ddp_configure_ddp(): # in DDPStrategy configure_ddp(), model wrapped by DistributedDataParallel assert isinstance(trainer.model, DistributedDataParallel) + +@RunIf(skip_windows=True) +def test_ddp_configure_ddp_validating(): + model = BoringModel() + ddp_strategy = DDPStrategy() trainer = Trainer( max_epochs=1, strategy=ddp_strategy, From 0083b6985a59224c60fb028e523c9a5d53fac723 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 16:10:01 -0800 Subject: [PATCH 12/69] fix horovod --- .../connectors/accelerator_connector.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index aa88bd1c20c9a..f3420eb536795 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -170,6 +170,8 @@ def __init__( # Reset strategy even user has specificed one self._strategy_check_and_fallbacks() self._init_strategy() + if _HOROVOD_AVAILABLE and isinstance(self.strategy, HorovodStrategy): + self.handle_horovod # --Precision---------------------------------------------------------------- self.precision_plugin = self._check_capatibility_and_init_precision() @@ -530,7 +532,7 @@ def _strategy_check_and_fallbacks(self): rank_zero_warn( "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
) - if "ddp_spawn" in _strategy_flag and ( + if _strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() ): _strategy_flag = "ddp" @@ -554,6 +556,28 @@ def _init_strategy(self): else: self.strategy = self._strategy_flag + def handle_horovod(self): + if self._num_nodes_flag > 1: + raise MisconfigurationException( + "Horovod does not support setting num_nodes / num_gpus explicitly. Use " + "horovodrun / mpirun to configure the number of processes." + ) + + if isinstance(self.strategy, HorovodStrategy) and not _HOROVOD_AVAILABLE: + raise MisconfigurationException( + 'Requested `accelerator="horovod"`, but Horovod is not installed.' + "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" + ) + + import horovod.torch as hvd + + hvd.init() + if isinstance(self.accelerator, GPUAccelerator): + # Horovod assigns one local GPU per process + self._parallel_device = list(range(hvd.local_size())) + else: + self._parallel_device = hvd.local_size() + def _check_capatibility_and_init_precision(self): self._precision_misconfig_check() if isinstance(self._precision_flag, PrecisionPlugin): From 7a5c3ba9cddbb7e2c1f72e49818b703d2a47a737 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 16:32:17 -0800 Subject: [PATCH 13/69] fix horovod --- .../trainer/connectors/accelerator_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index f3420eb536795..62da8bf9e3a0b 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -77,6 +77,9 @@ log = logging.getLogger(__name__) +if _HOROVOD_AVAILABLE: + import horovod.torch as hvd + class AcceleratorConnector: def __init__( @@ -569,8 +572,6 @@ def handle_horovod(self): "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" ) - import horovod.torch as hvd - hvd.init() if isinstance(self.accelerator, GPUAccelerator): # Horovod assigns one local GPU per process From ca96f841682f3fa379c62fdd1a2648b467c852cc Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 28 Jan 2022 16:38:37 -0800 Subject: [PATCH 14/69] debug tpu --- .../trainer/connectors/accelerator_connector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 62da8bf9e3a0b..1b25a0d3a2dac 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -492,9 +492,6 @@ def _is_slurm_managing_tasks(self): return num_slurm_tasks == total_requested_devices def _choose_strategy(self): - if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): - self._strategy_flag = HorovodStrategy() - if self._accelerator_flag == "ipu": self._strategy_flag = "ipu" elif self._accelerator_flag == "tpu": @@ -502,6 +499,8 @@ def _choose_strategy(self): self._strategy_flag = "tpu_spawn" else: self._srategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) + elif _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): + self._strategy_flag = HorovodStrategy() else: if self._num_nodes_flag > 1: self._strategy_flag = 
"ddp" @@ -556,8 +555,10 @@ def _strategy_check_and_fallbacks(self): def _init_strategy(self): if isinstance(self._strategy_flag, str): self.strategy = StrategyRegistry.get(self._strategy_flag) - else: + elif isinstance(self._strategy_flag, Strategy): self.strategy = self._strategy_flag + else: + raise RuntimeError(f"{self.strategy} is not valid type: {self.strategy}") def handle_horovod(self): if self._num_nodes_flag > 1: From e55a5242b65e4c3b7eb992c8606c8fb352ee987b Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 31 Jan 2022 14:20:53 -0800 Subject: [PATCH 15/69] fix global rank --- pytorch_lightning/strategies/ddp.py | 2 -- pytorch_lightning/strategies/ddp_spawn.py | 2 -- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index fac1cbe2dc288..12376358799fe 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -109,7 +109,6 @@ def __init__( self._pids: Optional[List[int]] = None self._sync_dir: Optional[str] = None self._rank_0_has_called_call_children_scripts: bool = False - self.set_world_ranks() @property def is_distributed(self) -> bool: @@ -127,7 +126,6 @@ def num_nodes(self) -> int: def num_nodes(self, num_nodes: int) -> None: # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks self._num_nodes = num_nodes - self.set_world_ranks() @property def num_processes(self): diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 2e73c64a1b207..70b14bceac845 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -81,7 +81,6 @@ def __init__( self._ddp_comm_hook = ddp_comm_hook self._ddp_comm_wrapper = ddp_comm_wrapper self._local_rank = 0 - self.set_world_ranks() @property def num_nodes(self) -> int: @@ -95,7 +94,6 @@ def num_processes(self): def num_nodes(self, num_nodes: int) -> None: # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks self._num_nodes = num_nodes - self.set_world_ranks() @property def local_rank(self) -> int: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 1b25a0d3a2dac..951f29f342254 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -690,6 +690,8 @@ def _lazy_init_strategy(self): self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "sync_batchnorm"): self.strategy.sync_batchnorm = self.sync_batchnorm + if hasattr(self.strategy, "set_world_ranks"): + self.strategy.set_world_ranks() from pytorch_lightning.utilities import _IS_INTERACTIVE From 9996fea3b2365e37205c13752a6db369c1f86788 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 31 Jan 2022 16:01:39 -0800 Subject: [PATCH 16/69] fix horovod --- .../connectors/accelerator_connector.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 951f29f342254..9e04488e675e2 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -173,8 +173,6 @@ def __init__( # Reset strategy even user has 
specificed one self._strategy_check_and_fallbacks() self._init_strategy() - if _HOROVOD_AVAILABLE and isinstance(self.strategy, HorovodStrategy): - self.handle_horovod # --Precision---------------------------------------------------------------- self.precision_plugin = self._check_capatibility_and_init_precision() @@ -413,7 +411,7 @@ def _choose_accelerator(self): # self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, torch.cuda.is_available(), True] # ): # # only apply to gpu to keep backward compatibility - # if self._accelerator_flag == accelerator_flag == "gpu": + # if self._accelerator_flag == accelerator_flag: # if not available: # raise MisconfigurationException( # f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" @@ -498,9 +496,10 @@ def _choose_strategy(self): if self._parallel_devices and len(self._parallel_devices) > 1: self._strategy_flag = "tpu_spawn" else: - self._srategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) + # TODO lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" + self._strategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) elif _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): - self._strategy_flag = HorovodStrategy() + self._strategy_flag = "horovod" else: if self._num_nodes_flag > 1: self._strategy_flag = "ddp" @@ -510,6 +509,7 @@ def _choose_strategy(self): if self._accelerator_flag == "gpu" else "cpu" ) + # TODO lazy initialized device, then here could be self._strategy_flag = "single_device" self._strategy_flag = SingleDeviceStrategy(device=device) elif len(self._parallel_devices) > 1: self._strategy_flag = "ddp_spawn" @@ -517,7 +517,7 @@ def _choose_strategy(self): self._strategy_flag = "ddp" def _strategy_check_and_fallbacks(self): - # fallback apply to user pass in object as well, so get the _strategy_flag first + # current logic, fallback only apply to user pass in str config not object config _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if _strategy_flag == "ddp_cpu": @@ -541,25 +541,10 @@ def _strategy_check_and_fallbacks(self): if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": rank_zero_warn(f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") _strategy_flag = "ddp" - # Current test check precision first. So move this test to the end for now. - # TODO update tests and uncomment this part - # if isinstance(self.accelerator, TPUAccelerator) and "tpu" not in _strategy_flag: - # raise ValueError( - # "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," - # f" found {_strategy_flag}." - # ) if _strategy_flag: self._strategy_flag = _strategy_flag - def _init_strategy(self): - if isinstance(self._strategy_flag, str): - self.strategy = StrategyRegistry.get(self._strategy_flag) - elif isinstance(self._strategy_flag, Strategy): - self.strategy = self._strategy_flag - else: - raise RuntimeError(f"{self.strategy} is not valid type: {self.strategy}") - def handle_horovod(self): if self._num_nodes_flag > 1: raise MisconfigurationException( @@ -567,7 +552,7 @@ def handle_horovod(self): "horovodrun / mpirun to configure the number of processes." ) - if isinstance(self.strategy, HorovodStrategy) and not _HOROVOD_AVAILABLE: + if not _HOROVOD_AVAILABLE: raise MisconfigurationException( 'Requested `accelerator="horovod"`, but Horovod is not installed.' 
"Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" @@ -578,7 +563,19 @@ def handle_horovod(self): # Horovod assigns one local GPU per process self._parallel_device = list(range(hvd.local_size())) else: - self._parallel_device = hvd.local_size() + self._parallel_device = [torch.device("cpu")] * hvd.local_size() + + def _init_strategy(self): + if isinstance(self._strategy_flag, HorovodStrategy) or self._strategy_flag == "horovod": + # handle horovod has to happen before initialize strategy because HorovodStrategy needs hvd.init() first. + # TODO lazy initialized and setup horovod strategy `global_rank` + self.handle_horovod() + if isinstance(self._strategy_flag, str): + self.strategy = StrategyRegistry.get(self._strategy_flag) + elif isinstance(self._strategy_flag, Strategy): + self.strategy = self._strategy_flag + else: + raise RuntimeError(f"{self.strategy} is not valid type: {self.strategy}") def _check_capatibility_and_init_precision(self): self._precision_misconfig_check() @@ -706,7 +703,8 @@ def _lazy_init_strategy(self): " creation inside the worker function." ) - + # TODO should be moved to _strategy_check_and_fallbacks(). + # Current test check precision first, so keep this check here to meet error order if isinstance(self.accelerator, TPUAccelerator) and not isinstance( self.strategy, (SingleTPUStrategy, TPUSpawnStrategy) ): From a14879c4898d12bcf86396a47464b71eda767dc3 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Tue, 1 Feb 2022 17:53:30 -0800 Subject: [PATCH 17/69] Update pytorch_lightning/utilities/exceptions.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/utilities/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/exceptions.py b/pytorch_lightning/utilities/exceptions.py index a0de06036792f..548e0cb655945 100644 --- a/pytorch_lightning/utilities/exceptions.py +++ b/pytorch_lightning/utilities/exceptions.py @@ -17,7 +17,7 @@ class MisconfigurationException(Exception): """Exception used to inform users of misuse with PyTorch Lightning.""" -class DeviceNotAvailibleException(Exception): +class DeviceNotAvailableException(Exception): """Exception used to inform users that requested devices are not availible.""" From 1626eee0b9701d618a5e09b984dbf92a3362bfae Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Tue, 1 Feb 2022 17:55:31 -0800 Subject: [PATCH 18/69] update horovod --- .../connectors/accelerator_connector.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 9e04488e675e2..32307f63809d6 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -320,13 +320,21 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) else: self.checkpoint_io = self._strategy_flag._checkpoint_io - if getattr(self._strategy_flag, "_cluster_environment", None): + if getattr(self._strategy_flag, "cluster_environment", None): if self._cluster_environment: raise MisconfigurationException( "cluster_environment set through both strategy class and plugins, choose one" ) else: self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") + # RFC existing accel_conn doesn't handle this, 
should we add conflict check? + # eg: parallel_device is torch.device(cpu) but accelerator=gpu + if hasattr(self._strategy_flag, "parallel_devices"): + if self._strategy_flag.parallel_devices: + if self._strategy_flag.parallel_devices[0].type == "cpu": + self._accelerator_flag = "cpu" + if self._strategy_flag.parallel_devices[0].type == "cuda": + self._accelerator_flag = "gpu" amp_type = amp_type.lower() if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) @@ -561,9 +569,9 @@ def handle_horovod(self): hvd.init() if isinstance(self.accelerator, GPUAccelerator): # Horovod assigns one local GPU per process - self._parallel_device = list(range(hvd.local_size())) + self._parallel_devices = list(range(hvd.local_size())) else: - self._parallel_device = [torch.device("cpu")] * hvd.local_size() + self._parallel_devices = [torch.device("cpu")] * hvd.local_size() def _init_strategy(self): if isinstance(self._strategy_flag, HorovodStrategy) or self._strategy_flag == "horovod": @@ -680,9 +688,13 @@ def _lazy_init_strategy(self): self.strategy.precision_plugin = self.precision_plugin if self.checkpoint_io: self.strategy.checkpoint_io = self.checkpoint_io - self.strategy.cluster_environment = self.cluster_environment + if hasattr(self.strategy, "cluster_environment"): + self.strategy.cluster_environment = self.cluster_environment if hasattr(self.strategy, "parallel_devices"): - self.strategy.parallel_devices = self._parallel_devices + if self.strategy.parallel_devices: + self._parallel_devices = self.strategy.parallel_devices + else: + self.strategy.parallel_devices = self._parallel_devices if hasattr(self.strategy, "num_nodes"): self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "sync_batchnorm"): From e13411974ec943f608a0e35780c58b14b3c5966e Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 2 Feb 2022 14:24:24 -0800 Subject: [PATCH 19/69] address some ananth's comments --- .../connectors/accelerator_connector.py | 60 +++++++++---------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 32307f63809d6..3a36a4f0e3ff7 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -194,11 +194,12 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl self._strategy_flag, self._accelerator_flag, self._precision_flag, - self._cluster_environment, + self._precision_plugin_flag, + self._cluster_environment_flag, self.checkpoint_io, self._amp_level_flag, self._amp_type_flag, - ) = (None, None, None, None, None, amp_type, amp_level) + ) = (None, None, None, None, None, None, amp_type, amp_level) if plugins: plugins = [plugins] if not isinstance(plugins, list) else plugins @@ -266,13 +267,6 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl f"Allowed precision values: {PrecisionType.supported_types()}" ) self._precision_flag = precision - # handle duplications and conflict - # [RFC] current logic doesn't handle precision_plugin duplication - # if plugins: - # for plugin in plugins: - # if isinstance(plugin, PrecisionPlugin): - # self._precision_flag = precision - # raise MisconfigurationException("precision set in both precision flag and plugin flag") if plugins: for plugin in plugins: @@ -283,14 +277,14 @@ def _config_check_and_set_final_flags(self, strategy, 
accelerator, precision, pl f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={plugin})` instead." ) - elif isinstance(plugin, PrecisionPlugin) or ( - isinstance(plugin, str) and plugin in self._supported_precision - ): + elif isinstance(plugin, PrecisionPlugin): + self._precision_plugin_flag = plugin + elif isinstance(plugin, str) and plugin in self._supported_precision: self._precision_flag = plugin elif isinstance(plugin, CheckpointIO): self.checkpoint_io = plugin elif isinstance(plugin, ClusterEnvironment): - self._cluster_environment = plugin + self._cluster_environment_flag = plugin else: raise MisconfigurationException( f"Found invalid type for plugin {plugin}. Expected a precision or training type plugin." @@ -306,13 +300,11 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl else: self._accelerator_flag = self._strategy_flag._accelerator if self._strategy_flag._precision_plugin: - # precision has default value 32, we can not tell whether user set it or not - # [RFC] remove default from trainer? - # if self._precision_flag: - # raise MisconfigurationException("precision set through both strategy class and flags, - # choose one place to set") - # else: - self._precision_flag = self._strategy_flag._precision_plugin + # [RFC] handle precision plugin set up conflict? + if self._precision_plugin_flag: + raise MisconfigurationException("precision set through both strategy class and plugins, choose one") + else: + self._precision_plugin_flag = self._strategy_flag._precision_plugin if self._strategy_flag._checkpoint_io: if self.checkpoint_io: raise MisconfigurationException( @@ -321,12 +313,12 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl else: self.checkpoint_io = self._strategy_flag._checkpoint_io if getattr(self._strategy_flag, "cluster_environment", None): - if self._cluster_environment: + if self._cluster_environment_flag: raise MisconfigurationException( "cluster_environment set through both strategy class and plugins, choose one" ) else: - self._cluster_environment = getattr(self._strategy_flag, "cluster_environment") + self._cluster_environment_flag = getattr(self._strategy_flag, "cluster_environment") # RFC existing accel_conn doesn't handle this, should we add conflict check? 
# eg: parallel_device is torch.device(cpu) but accelerator=gpu if hasattr(self._strategy_flag, "parallel_devices"): @@ -451,9 +443,11 @@ def _set_parallel_devices_and_init_accelerator(self): self._device_flag = GPUAccelerator.auto_device_count() if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): self._device_flag = int(self._device_flag) - self._parallel_devices = [ - torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag) - ] + self._parallel_devices = ( + [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + if self._device_flag != 0 + else [] + ) else: self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] @@ -473,8 +467,8 @@ def _set_parallel_devices_and_init_accelerator(self): def _choose_and_init_cluster_environment(self): self.cluster_environment = LightningEnvironment() - if isinstance(self._cluster_environment, ClusterEnvironment): - self.cluster_environment = self._cluster_environment + if isinstance(self._cluster_environment_flag, ClusterEnvironment): + self.cluster_environment = self._cluster_environment_flag elif self._is_slurm_managing_tasks(): rank_zero_info("Multiprocessing is handled by SLURM.") self.cluster_environment = SLURMEnvironment() @@ -587,9 +581,8 @@ def _init_strategy(self): def _check_capatibility_and_init_precision(self): self._precision_misconfig_check() - if isinstance(self._precision_flag, PrecisionPlugin): - return self._precision_flag - self.precision_plugin = None + if isinstance(self._precision_plugin_flag, PrecisionPlugin): + return self._precision_plugin_flag if isinstance(self.accelerator, IPUAccelerator): return IPUPrecisionPlugin(self._precision_flag) @@ -655,11 +648,12 @@ def _precision_misconfig_check(self): ) if ( isinstance(self.accelerator, TPUAccelerator) - and isinstance(self._precision_flag, PrecisionPlugin) - and not isinstance(self._precision_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin)) + and self._precision_plugin_flag + and not isinstance(self._precision_plugin_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin)) ): raise ValueError( - f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," f" found: {self._precision_flag}." + f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," + f" found: {self._precision_plugin_flag}." 
) if ( self._precision_flag == 16 From 7c1eb85f801df79878c7591b6393acfe313d9fd1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 2 Feb 2022 15:59:28 -0800 Subject: [PATCH 20/69] draft --- .../trainer/connectors/accelerator_connector.py | 12 +++++++++--- pytorch_lightning/trainer/trainer.py | 2 +- tests/utilities/test_cli.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 3a36a4f0e3ff7..fe705d0cdea6f 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -767,11 +767,17 @@ def devices(self): @property def tpu_cores(self) -> int: - return self.devices + if isinstance(self.accelerator, TPUAccelerator): + return self.devices + else: + return 0 @property - def ipus(self) -> int: - return self.devices + def num_ipus(self) -> int: + if isinstance(self.accelerator, IPUAccelerator): + return self.devices + else: + return 0 @property def num_gpus(self) -> int: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 70f72bd2488cc..e9e89f01675ac 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -636,7 +636,7 @@ def _determine_data_use_amount(self, overfit_batches: float) -> None: self.limit_val_batches = 0 def _setup_on_init(self, num_sanity_val_steps: int) -> None: - # self._log_device_info() + self._log_device_info() self.should_stop = False self.state = TrainerState() diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 5ef2cf98cf3e7..9f62e10cd0fae 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -324,7 +324,7 @@ def test_lightning_cli_args_cluster_environments(tmpdir): class TestModel(BoringModel): def on_fit_start(self): # Ensure SLURMEnvironment is set, instead of default LightningEnvironment - assert isinstance(self.trainer._accelerator_connector._cluster_environment, SLURMEnvironment) + assert isinstance(self.trainer._accelerator_connector.cluster_environment, SLURMEnvironment) self.trainer.ran_asserts = True with mock.patch("sys.argv", ["any.py", "fit", f"--trainer.plugins={json.dumps(plugins)}"]): From 92deb7e5800f0a0b3f19a560ddf3e77520b1be49 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:38:52 +0000 Subject: [PATCH 21/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/ipu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index c13431d1ad8d8..6a0ef29be0384 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -13,7 +13,7 @@ # limitations under the License. 
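
The `tpu_cores` and `num_ipus` hunks in the patch above gate the legacy count accessors on the active accelerator type, returning 0 when it does not match. A toy version of that gating pattern, with locally defined stand-in classes instead of Lightning's accelerators and connector:

class ToyTPUAccelerator: ...
class ToyIPUAccelerator: ...

class ToyConnector:
    # Stand-in connector: holds an accelerator instance and a device count.
    def __init__(self, accelerator, devices: int):
        self.accelerator = accelerator
        self.devices = devices

    @property
    def tpu_cores(self) -> int:
        # only meaningful when a TPU accelerator is active
        return self.devices if isinstance(self.accelerator, ToyTPUAccelerator) else 0

    @property
    def num_ipus(self) -> int:
        # likewise, only meaningful for an IPU accelerator
        return self.devices if isinstance(self.accelerator, ToyIPUAccelerator) else 0

assert ToyConnector(ToyTPUAccelerator(), 8).tpu_cores == 8
assert ToyConnector(ToyTPUAccelerator(), 8).num_ipus == 0
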
import json import os -from typing import Any, Callable, List, Optional, Union, Dict +from typing import Any, Callable, Dict, List, Optional, Union import torch from torch.utils.data import DataLoader From 8aa1f686e719bfaab4b296a0616e30bd515bc203 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 2 Feb 2022 17:37:14 -0800 Subject: [PATCH 22/69] fix ipus and cli tests --- pytorch_lightning/strategies/ipu.py | 2 +- .../connectors/accelerator_connector.py | 41 ++++++++++--------- tests/utilities/test_cli.py | 5 ++- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index 6a0ef29be0384..d6e1e12a36465 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -62,7 +62,7 @@ def _move_float_tensors_to_half(self, batch: Any) -> Any: class IPUStrategy(ParallelStrategy): """Plugin for training on IPU devices.""" - distributed_backend = "ipu" + distributed_backend = "ipu_strategy" def __init__( self, diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index fe705d0cdea6f..b626990271f06 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -142,7 +142,6 @@ def __init__( # --Parsing_flags------------------------------------------------------ # Get registered strategies, existing accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() - # print(self._existing_strategies_str) self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] self._supported_precision = PrecisionType.supported_types() @@ -158,7 +157,7 @@ def __init__( # --Accelerator------------------------------------------------------------- # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: - self._choose_accelerator() + self._accelerator_flag = self._choose_accelerator() # else: # # [RFC] move to XAccelerator class init? # self._check_device_availibility() @@ -390,20 +389,20 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self._accelerator_flag = "cpu" def _choose_accelerator(self): + if _TPU_AVAILABLE: + return "tpu" + if _IPU_AVAILABLE: + return "ipu" if self._accelerator_flag == "auto": - if _TPU_AVAILABLE: - self._accelerator_flag = "tpu" - elif _IPU_AVAILABLE: - self._accelerator_flag = "ipu" - elif torch.cuda.is_available() and torch.cuda.device_count() > 0: - self._accelerator_flag = "gpu" + if torch.cuda.is_available() and torch.cuda.device_count() > 0: + return "gpu" else: - self._accelerator_flag = "cpu" if self._device_flag == "auto": self._device_flag = 1 + return "cpu" # [RFC] this is current logic, if accelerator not set, default cpu? 
else: - self._accelerator_flag = "cpu" + return "cpu" # TODO move this to xAccelerator # def _check_device_availibility(self): @@ -492,8 +491,8 @@ def _is_slurm_managing_tasks(self): return num_slurm_tasks == total_requested_devices def _choose_strategy(self): - if self._accelerator_flag == "ipu": - self._strategy_flag = "ipu" + if self._accelerator_flag == "ipu_strategy": + self._strategy_flag = "ipu_strategy" elif self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: self._strategy_flag = "tpu_spawn" @@ -762,29 +761,31 @@ def devices(self): return 1 elif isinstance(self.strategy, ParallelStrategy): return len(self.strategy.parallel_devices) - else: - return 0 + return 0 @property def tpu_cores(self) -> int: if isinstance(self.accelerator, TPUAccelerator): return self.devices - else: - return 0 + return 0 + + @property + def tpu_id(self) -> Optional[int]: + if isinstance(self.accelerator, TPUAccelerator): + return self.parallel_devices[0] + return None @property def num_ipus(self) -> int: if isinstance(self.accelerator, IPUAccelerator): return self.devices - else: - return 0 + return 0 @property def num_gpus(self) -> int: if isinstance(self.accelerator, GPUAccelerator): return self.devices - else: - return 0 + return 0 # def parallel_device_ids(): @property diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 9f62e10cd0fae..5ba1006c234e9 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -580,7 +580,10 @@ def on_fit_start(self): @pytest.mark.parametrize( "trainer_kwargs", ( - dict(strategy="ddp_spawn"), + # dict(strategy="ddp_spawn") + # !! old accl_conn will choose singleDeviceStrategy for both strategy=ddp/ddp_spawn + # this test never worked with DDPSpawnStrategy + dict(strategy="single_device"), dict(strategy="ddp"), pytest.param({"tpu_cores": 1}, marks=RunIf(tpu=True)), ), From f4cca3c0dc39ec95bf86d47a0c9aaaec269ba125 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 2 Feb 2022 18:28:51 -0800 Subject: [PATCH 23/69] fix typo --- .../trainer/connectors/accelerator_connector.py | 9 +++++---- tests/accelerators/test_ipu.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b626990271f06..e55a3a09de991 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -463,6 +463,7 @@ def _set_parallel_devices_and_init_accelerator(self): ) self._gpus = self._device_flag if not self._gpus else self._gpus + self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores def _choose_and_init_cluster_environment(self): self.cluster_environment = LightningEnvironment() @@ -491,7 +492,7 @@ def _is_slurm_managing_tasks(self): return num_slurm_tasks == total_requested_devices def _choose_strategy(self): - if self._accelerator_flag == "ipu_strategy": + if self._accelerator_flag == "ipu": self._strategy_flag = "ipu_strategy" elif self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: @@ -764,15 +765,15 @@ def devices(self): return 0 @property - def tpu_cores(self) -> int: + def tpu_cores(self): if isinstance(self.accelerator, TPUAccelerator): - return self.devices + return self._tpu_cores return 0 @property def tpu_id(self) -> Optional[int]: if isinstance(self.accelerator, TPUAccelerator): - return self.parallel_devices[0] + 
return self.tpu_cores[0] return None @property diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index a691f4f62d983..40ceab7195219 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -116,7 +116,7 @@ def test_accelerator_selected(tmpdir): @RunIf(ipu=True) def test_warning_if_ipus_not_used(tmpdir): with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"): - Trainer(default_root_dir=tmpdir) + Trainer(default_root_dir=tmpdir, accelerator="cpu") @RunIf(ipu=True) From 677c6f12b5e5db0bd0a0d6db2caddeef2fb28774 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 3 Feb 2022 15:40:36 -0800 Subject: [PATCH 24/69] fix tests --- pytorch_lightning/callbacks/gpu_stats_monitor.py | 2 +- pytorch_lightning/strategies/ddp.py | 1 - pytorch_lightning/strategies/ddp2.py | 1 - pytorch_lightning/strategies/ddp_spawn.py | 5 ----- pytorch_lightning/strategies/deepspeed.py | 2 +- pytorch_lightning/strategies/dp.py | 1 - pytorch_lightning/strategies/fully_sharded.py | 2 +- pytorch_lightning/strategies/horovod.py | 1 - pytorch_lightning/strategies/sharded.py | 2 +- pytorch_lightning/strategies/sharded_spawn.py | 1 - .../trainer/connectors/accelerator_connector.py | 5 +++-- tests/accelerators/test_tpu.py | 11 ++++++++--- tests/utilities/test_cli.py | 2 +- 13 files changed, 16 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index 2d10b17acdc95..68d2ef3ba69eb 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -127,7 +127,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O if not trainer.logger: raise MisconfigurationException("Cannot use GPUStatsMonitor callback with Trainer that has no logger.") - if trainer._device_type != _AcceleratorType.GPU: + if trainer._device_type != _AcceleratorType.GPU.lower(): raise MisconfigurationException( "You are using GPUStatsMonitor but are not running on GPU" f" since gpus attribute in Trainer is set to {trainer.gpus}." 
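
The `tpu_id` fix above only returns a core index when `_tpu_cores` was given as an explicit list (for example `tpu_cores=[1]`); an integer count has no single id. A standalone illustration of that rule, not the Lightning implementation itself:

from typing import List, Optional, Union

def tpu_id(tpu_cores: Union[int, List[int], None]) -> Optional[int]:
    # A single core id only exists when specific cores were picked as a list;
    # an integer such as 8 means "use 8 cores" and has no meaningful id.
    if isinstance(tpu_cores, list):
        return tpu_cores[0]
    return None

assert tpu_id([1]) == 1
assert tpu_id(8) is None
assert tpu_id(None) is None
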
diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 12376358799fe..7fd6132fa025a 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -50,7 +50,6 @@ from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index ba8e769c35772..ff84a50fa52ba 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -17,7 +17,6 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.types import _METRIC_COLLECTION diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 70b14bceac845..1cc926eb2fef8 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -37,7 +37,6 @@ from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed @@ -86,10 +85,6 @@ def __init__( def num_nodes(self) -> int: return self._num_nodes - @property - def num_processes(self): - return len(self.parallel_devices) if self.parallel_devices is not None else 0 - @num_nodes.setter def num_nodes(self, num_nodes: int) -> None: # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index 530ede34ec899..bd6b131574e56 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ b/pytorch_lightning/strategies/deepspeed.py @@ -35,7 +35,7 @@ from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import log -from pytorch_lightning.utilities.enums import _StrategyType, AMPType, PrecisionType +from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index 01066a21c0e71..a886fcdcbee63 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -22,7 +22,6 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import _METRIC_COLLECTION, STEP_OUTPUT diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 4a05abd0dd9d8..cd7155cc41170 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -23,7 +23,7 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE -from pytorch_lightning.utilities.enums import _StrategyType, PrecisionType +from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 1e99dbc429ed8..79e58c164028c 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -26,7 +26,6 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.distributed import group as dist_group from pytorch_lightning.utilities.distributed import ReduceOp -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 1f402126b6efe..4efdfb685722f 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -22,7 +22,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.enums import _StrategyType, PrecisionType +from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index 1a7c6b6e00d1c..a6e007a4be31b 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -21,7 +21,6 @@ import pytorch_lightning as pl from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index e55a3a09de991..93579130c670a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ 
-596,7 +596,7 @@ def _check_capatibility_and_init_precision(self): f" is not supported with TPUs. Using `precision='bf16'` instead." ) return TPUBf16PrecisionPlugin() - if self._strategy_flag == "deepspeed" or isinstance(self._strategy_flag, DeepSpeedStrategy): + if isinstance(self._strategy_flag, DeepSpeedStrategy): return DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) if self._precision_flag == 32: @@ -773,7 +773,8 @@ def tpu_cores(self): @property def tpu_id(self) -> Optional[int]: if isinstance(self.accelerator, TPUAccelerator): - return self.tpu_cores[0] + if isinstance(self._tpu_cores, list): + return self._tpu_cores[0] return None @property diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index bec80ec9ccbc1..88b926ab382e1 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -228,9 +228,14 @@ def test_ddp_cpu_not_supported_on_tpus(): @RunIf(tpu=True) -@pytest.mark.parametrize("strategy", ["ddp_spawn", "tpu_spawn_debug"]) -def test_strategy_choice_tpu_str(tmpdir, strategy): - trainer = Trainer(strategy=strategy, accelerator="tpu", devices=8) +def test_strategy_choice_tpu_str_ddp_spawn(tmpdir, strategy): + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): + Trainer(strategy="ddp_spawn", accelerator="tpu", devices=8) + + +@RunIf(tpu=True) +def test_strategy_choice_tpu_str_tpu_spawn_debug(tmpdir, strategy): + trainer = Trainer(strategy="tpu_spawn_debug", accelerator="tpu", devices=8) assert isinstance(trainer.strategy, TPUSpawnStrategy) diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 5ba1006c234e9..8992f0c1accd9 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -581,10 +581,10 @@ def on_fit_start(self): "trainer_kwargs", ( # dict(strategy="ddp_spawn") + # dict(strategy="ddp") # !! old accl_conn will choose singleDeviceStrategy for both strategy=ddp/ddp_spawn # this test never worked with DDPSpawnStrategy dict(strategy="single_device"), - dict(strategy="ddp"), pytest.param({"tpu_cores": 1}, marks=RunIf(tpu=True)), ), ) From 53516214417e8213ce8cbd4e8b0f7e1694917dd7 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 3 Feb 2022 15:48:46 -0800 Subject: [PATCH 25/69] fix pre commit --- pytorch_lightning/strategies/single_device.py | 2 +- pytorch_lightning/strategies/strategy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index adbd3d71371b5..cdbd8eaa2b7a6 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -13,7 +13,7 @@ # limitations under the License. 
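
The TPU tests split in the patch above assert that a TPU accelerator is rejected unless it is paired with a TPU-capable strategy. A small standalone sketch of that compatibility check, using toy classes rather than the Lightning accelerator and strategy types:

class ToyTPUAccel: ...
class ToySingleTPUStrategy: ...
class ToyTPUSpawnStrategy: ...
class ToyDDPSpawnStrategy: ...

def validate_tpu_strategy(accelerator, strategy):
    # Reject a TPU accelerator paired with a non-TPU strategy, mirroring the
    # ValueError exercised by the tests above.
    if isinstance(accelerator, ToyTPUAccel) and not isinstance(
        strategy, (ToySingleTPUStrategy, ToyTPUSpawnStrategy)
    ):
        raise ValueError(
            "A TPU accelerator can only be used with a single-TPU or TPU-spawn strategy, "
            f"found {strategy!r}."
        )

validate_tpu_strategy(ToyTPUAccel(), ToyTPUSpawnStrategy())  # accepted
try:
    validate_tpu_strategy(ToyTPUAccel(), ToyDDPSpawnStrategy())
except ValueError:
    pass  # expected, as in test_strategy_choice_tpu_str_ddp_spawn above
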
from __future__ import annotations -from typing import Any, Dict +from typing import Any import torch diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index 4b339e0b0efb4..33354dc539201 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -440,7 +440,7 @@ def teardown(self) -> None: self.precision_plugin.teardown() @classmethod - def register_strategies(cls, strategies_registry) -> None: + def register_strategies(cls, strategy_registry) -> None: if cls.distributed_backend: strategy_registry.register( cls.distributed_backend, From 836eb98f906776cf797d350597b7024844e82e4a Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 3 Feb 2022 16:21:56 -0800 Subject: [PATCH 26/69] address comments --- pytorch_lightning/strategies/ddp.py | 2 +- pytorch_lightning/strategies/ddp_spawn.py | 2 +- .../connectors/accelerator_connector.py | 24 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 7fd6132fa025a..3aaa36b01edf2 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -428,7 +428,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.distributed_backend, cls, - description="Strategy", + description=f"{cls.__class__.__name__} Strategy", ) def _should_run_deadlock_detection(self) -> bool: diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 1cc926eb2fef8..75ee8fd4eadb7 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -367,7 +367,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.distributed_backend, cls, - description="Strategy", + description=f"{cls.__class__.__name__} Strategy", ) def teardown(self) -> None: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 93579130c670a..8578ae05b8be8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -520,32 +520,32 @@ def _choose_strategy(self): def _strategy_check_and_fallbacks(self): # current logic, fallback only apply to user pass in str config not object config - _strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag + strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag - if _strategy_flag == "ddp_cpu": + if strategy_flag == "ddp_cpu": if _TPU_AVAILABLE: raise MisconfigurationException( "`accelerator='ddp_cpu'` is not supported on TPU machines. " "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) if self._device_flag == 1 and self._num_nodes_flag > 1: - _strategy_flag = "ddp" + strategy_flag = "ddp" else: - _strategy_flag = "ddp_spawn" + strategy_flag = "ddp_spawn" if self._accelerator_flag == "gpu": rank_zero_warn( "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." 
) - if _strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( + if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() ): - _strategy_flag = "ddp" - if _strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": - rank_zero_warn(f"{_strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") - _strategy_flag = "ddp" + strategy_flag = "ddp" + if strategy_flag in ("dp", "ddp2") and self._accelerator_flag == "cpu": + rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") + strategy_flag = "ddp" - if _strategy_flag: - self._strategy_flag = _strategy_flag + if strategy_flag: + self._strategy_flag = strategy_flag def handle_horovod(self): if self._num_nodes_flag > 1: @@ -596,7 +596,7 @@ def _check_capatibility_and_init_precision(self): f" is not supported with TPUs. Using `precision='bf16'` instead." ) return TPUBf16PrecisionPlugin() - if isinstance(self._strategy_flag, DeepSpeedStrategy): + if isinstance(self.strategy, DeepSpeedStrategy): return DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) if self._precision_flag == 32: From 18c4d9ea4ca4a2c87ce96dafc3183a4654778758 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 5 Feb 2022 17:37:46 +0100 Subject: [PATCH 27/69] rename ttp to strategy --- tests/accelerators/test_tpu.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 88b926ab382e1..2c0b265b0fd16 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -292,27 +292,27 @@ def forward(self, x): def test_tpu_invalid_raises(): - training_type_plugin = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=PrecisionPlugin()) + strategy = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=PrecisionPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): - Trainer(strategy=training_type_plugin) + Trainer(strategy=strategy) - training_type_plugin = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin()) + strategy = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): - Trainer(strategy=training_type_plugin) + Trainer(strategy=strategy) def test_tpu_invalid_raises_set_precision_with_strategy(): accelerator = TPUAccelerator() - training_type_plugin = TPUSpawnStrategy(accelerator=accelerator, precision_plugin=PrecisionPlugin()) + strategy = TPUSpawnStrategy(accelerator=accelerator, precision_plugin=PrecisionPlugin()) with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): - Trainer(strategy=training_type_plugin) + Trainer(strategy=strategy) accelerator = TPUAccelerator() - training_type_plugin = DDPStrategy(accelerator=accelerator, precision_plugin=TPUPrecisionPlugin()) + strategy = DDPStrategy(accelerator=accelerator, precision_plugin=TPUPrecisionPlugin()) with pytest.raises( ValueError, match="The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy" ): - Trainer(strategy=training_type_plugin) + Trainer(strategy=strategy) @RunIf(tpu=True) From 0bbc1c4c5a97938c2ca52b6a400f098fe441b8a8 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 5 Feb 2022 17:37:54 +0100 Subject: [PATCH 28/69] fix typo --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 338b6441944cb..2df4a8e1b63da 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -903,7 +903,7 @@ def test_unsupported_tpu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='tpu', precision=64\)` is not implemented"): Trainer(accelerator="tpu", precision=64) - # if user haven't set strategy, accelerator_connector will choose the TPUSingleStrategy or TPUSpawnStrategy + # if user didn't set strategy, AcceleratorConnector will choose the TPUSingleStrategy or TPUSpawnStrategy with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"): Trainer(accelerator="tpu", precision=16, strategy="ddp") From 2d54316f312c56b2df62d62ce07f130511e55814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 5 Feb 2022 18:21:27 +0100 Subject: [PATCH 29/69] add typing to constructor --- .../connectors/accelerator_connector.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 8578ae05b8be8..932c343347efd 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -34,7 +34,7 @@ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, - TPUPrecisionPlugin, + TPUPrecisionPlugin, PLUGIN_INPUT, ) from pytorch_lightning.plugins.environments import ( BaguaEnvironment, @@ -84,25 +84,28 @@ class AcceleratorConnector: def __init__( self, - devices, - num_nodes, - accelerator, # reduce typing - strategy: Optional[Union[str, Strategy]], - plugins, - precision, - amp_type, - amp_level, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic: bool, - num_processes, # deprecated - tpu_cores, # deprecated - ipus, # deprecated - gpus, # deprecated - gpu_ids, + devices: Optional[Union[List[int], str, int]] = None, + num_nodes: int = 1, + accelerator: Optional[Union[str, Accelerator]] = None, + strategy: Optional[Union[str, Strategy]] = None, + plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, + precision: Union[int, str] = 32, + amp_type: str = "native", + amp_level: Optional[str] = None, + sync_batchnorm: bool = False, + benchmark: bool = False, + replace_sampler_ddp: bool = True, + deterministic: bool = False, # TODO: why is it unused? + num_processes: int = None, # deprecated + tpu_cores: Optional[Union[List[int], int]] = None, # deprecated + ipus: Optional[int] = None, # deprecated + gpus: Optional[Union[List[int], str, int]] = None, # deprecated + gpu_ids: Optional[List[int]] = None, # TODO: why is it unused? ): - """ + """The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other + components such as the Accelerator and Precision plugin. + + A. accelerator flag could be: 1. strategy class (deprecated in 1.5 will be removed in 1.7) 2. 
strategy str (deprecated in 1.5 will be removed in 1.7) From f7eee0579b28c8ff0c592983809d21ddd5be8289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 5 Feb 2022 18:38:23 +0100 Subject: [PATCH 30/69] update on comments --- .../connectors/accelerator_connector.py | 62 ++++++++++--------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 932c343347efd..dc9097af14a65 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -142,66 +142,67 @@ def __init__( self.replace_sampler_ddp = replace_sampler_ddp self.sync_batchnorm = sync_batchnorm - # --Parsing_flags------------------------------------------------------ - # Get registered strategies, existing accelerators and precision plugins + # 1. Parsing flags + # Get registered strategies, built-in accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] self._supported_precision = PrecisionType.supported_types() - # raise misconfig exceptions if their is conflict between flags - # set the valid flag to self._x_flag after validation - # for example: if accelerator is strategy class, set self._strategy_flag = accelerator - # for devices: assign gpus ipus and etcs to accelerator_flag and devices_flag + # Raise an exception if there are conflicts between flags + # Set each valid flag to `self._x_flag` after validation + # Example: If accelerator is set to a strategy type, set `self._strategy_flag = accelerator`. + # For devices: Assign gpus, ipus, etc. to the accelerator flag and devices flag self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) self._device_config_check_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) - # --Accelerator------------------------------------------------------------- + # 2. Instantiate Accelerator # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_accelerator() # else: - # # [RFC] move to XAccelerator class init? + # # TODO: [RFC] move to XAccelerator class init? # self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() - # --Cluster_environment----------------------------------------------------- + # 3. Instantiate ClusterEnvironment self._choose_and_init_cluster_environment() - # --Strategy Part 1 : choose strategy and init strategy --------------------------------------- + # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: self._choose_strategy() - # Reset strategy even user has specificed one + # In specific cases, ignore user selection and fall back to a different strategy self._strategy_check_and_fallbacks() self._init_strategy() - # --Precision---------------------------------------------------------------- + # 5. Instantiate Precision Plugin self.precision_plugin = self._check_capatibility_and_init_precision() - # --Strategy Part 2 : init Strategy and set Strategy properties ------------- + # 6. Instantiate Strategy - Part 2 self._lazy_init_strategy() def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): """This method checks: - 1. 
strategy flag: strategy, accelerator and plugin can all set strategies - 2. accelerator: if accelerator flag is Accelerator related flag or class, set self._acceelrator_flag; - If accelerator is strategy related, logic handled in 1 above - 3. precision could be set by precision and plugins flag - 4. plugins could be duplicated in strategy (handled by 1), precision (handled by 3), - set checkpoint_io and cluster_environment + 1. strategy: strategy, accelerator and plugin can all be set to strategies + 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), + set self._acceelrator_flag accordingly. If the value is strategy related (instance or string), + it gets handled by 1. + 3. precision: The final value of the precision flag may be determined either by the precision argument or + by a plugin instance. + 4. plugins: a plugin could occur as a value of the strategy argument (handled by 1), or the precision + argument (handled by 3). We also extract the CheckpointIO and ClusterEnvironment plugins. """ - ( - self._strategy_flag, - self._accelerator_flag, - self._precision_flag, - self._precision_plugin_flag, - self._cluster_environment_flag, - self.checkpoint_io, - self._amp_level_flag, - self._amp_type_flag, - ) = (None, None, None, None, None, None, amp_type, amp_level) + self._strategy_flag = None + self._accelerator_flag = None + self._precision_flag = None + self._precision_plugin_flag = None + self._cluster_environment_flag = None + self.checkpoint_io = None + self._amp_level_flag = amp_type + self._amp_type_flag = amp_level + if plugins: plugins = [plugins] if not isinstance(plugins, list) else plugins @@ -292,7 +293,8 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl f"Found invalid type for plugin {plugin}. Expected a precision or training type plugin." ) - # if user pass in a strategy class which has accelerator, precision, checkpoint or cluster env set up + # handle the case when the user passes in a strategy instance which has an accelerator, precision, + # checkpoint io or cluster env set up if self._strategy_flag and isinstance(self._strategy_flag, Strategy): if self._strategy_flag._accelerator: if self._accelerator_flag: From 1022b250803e07b2f3baecc6f75e4b5f203e69c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 5 Feb 2022 23:10:29 +0100 Subject: [PATCH 31/69] typing, documentation improvements, adding todo's --- .../connectors/accelerator_connector.py | 120 +++++++++--------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index dc9097af14a65..2db1caf97d044 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -101,7 +101,7 @@ def __init__( ipus: Optional[int] = None, # deprecated gpus: Optional[Union[List[int], str, int]] = None, # deprecated gpu_ids: Optional[List[int]] = None, # TODO: why is it unused? - ): + ) -> None: """The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other components such as the Accelerator and Precision plugin. @@ -152,8 +152,8 @@ def __init__( # Set each valid flag to `self._x_flag` after validation # Example: If accelerator is set to a strategy type, set `self._strategy_flag = accelerator`. # For devices: Assign gpus, ipus, etc. 
to the accelerator flag and devices flag - self._config_check_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) - self._device_config_check_and_set_final_flags( + self._check_config_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) + self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) @@ -173,16 +173,16 @@ def __init__( if self._strategy_flag is None: self._choose_strategy() # In specific cases, ignore user selection and fall back to a different strategy - self._strategy_check_and_fallbacks() + self._check_strategy_and_fallback() self._init_strategy() # 5. Instantiate Precision Plugin - self.precision_plugin = self._check_capatibility_and_init_precision() + self.precision_plugin = self._check_and_init_precision() # 6. Instantiate Strategy - Part 2 self._lazy_init_strategy() - def _config_check_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level): + def _check_config_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level) -> None: """This method checks: 1. strategy: strategy, accelerator and plugin can all be set to strategies @@ -295,6 +295,7 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl # handle the case when the user passes in a strategy instance which has an accelerator, precision, # checkpoint io or cluster env set up + # TODO: @awaelchli imporve the error messages below if self._strategy_flag and isinstance(self._strategy_flag, Strategy): if self._strategy_flag._accelerator: if self._accelerator_flag: @@ -323,8 +324,9 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) else: self._cluster_environment_flag = getattr(self._strategy_flag, "cluster_environment") - # RFC existing accel_conn doesn't handle this, should we add conflict check? - # eg: parallel_device is torch.device(cpu) but accelerator=gpu + + # TODO: RFC existing accel_conn doesn't handle this, should we add conflict check? 
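Illustratively, at the Trainer call site the flag-resolution rules in this docstring play out roughly as in the sketch below (an assumption-laden example, not part of the patch: it presumes a machine with two CUDA GPUs and uses only the class names that appear in these hunks).

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPStrategy

    # A Strategy instance combined with non-conflicting flags: the accelerator, devices and
    # precision settings are folded into the strategy later, during _lazy_init_strategy().
    trainer = Trainer(strategy=DDPStrategy(), accelerator="gpu", devices=2)

    # Two different strategies passed at once (one via `strategy`, one via the deprecated
    # `accelerator` path) are rejected by the checks above:
    # Trainer(strategy="ddp_spawn", accelerator=DDPStrategy())  # -> MisconfigurationException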
+ # eg: parallel_device is torch.device(cpu) but accelerator=gpu if hasattr(self._strategy_flag, "parallel_devices"): if self._strategy_flag.parallel_devices: if self._strategy_flag.parallel_devices[0].type == "cpu": @@ -341,36 +343,37 @@ def _config_check_and_set_final_flags(self, strategy, accelerator, precision, pl ) self._amp_level_flag = amp_level - def _device_config_check_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores): + def _check_device_config_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores) -> None: if num_nodes == "auto": self._num_nodes_flag = 1 else: self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 self._device_flag = devices - # Delete when remove num_processes, gpus, ipus and tpu_cores + + # TODO: Delete this parsing section when num_processes, gpus, ipus and tpu_cores get removed self._gpus = gpus self._tpu_cores = tpu_cores gpus = device_parser.parse_gpu_ids(gpus) tpu_cores = device_parser.parse_tpu_cores(tpu_cores) deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): - self._mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores ) - # Delete end - if self._device_flag == "auto": - if self._accelerator_flag is None: - raise MisconfigurationException( - f"You passed `devices={devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" - ) - def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + if self._device_flag == "auto" and self._accelerator_flag is None: + raise MisconfigurationException( + f"You passed `devices={devices}` but haven't specified" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" + ) + + def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores - ): - # set devices base on num_processes, gpus, ipus, tpu_cores + ) -> None: + """Sets the `device_flag` based on num_processes, gpus, ipus, tpu_cores.""" if devices: + # TODO: @awaelchli improve error message rank_zero_warn( f"The flag `devices={devices}` will be ignored, " f"instand the device specific number {deprecated_devices_specific_flag} will be used" @@ -379,11 +382,12 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: + # TODO: @awaelchli improve error message rank_zero_warn("more than one device specifc flag has been set") self._device_flag = deprecated_devices_specific_flag if not self._accelerator_flag: - # set accelerator type base on num_processes, gpus, ipus, tpu_cores + # set accelerator type based on num_processes, gpus, ipus, tpu_cores if ipus: self._accelerator_flag = "ipu" if tpu_cores: @@ -393,7 +397,8 @@ def _mapping_deprecated_devices_specfic_info_to_accelerator_and_device_flag( if num_processes: self._accelerator_flag = "cpu" - def _choose_accelerator(self): + def _choose_accelerator(self) -> str: + """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if _TPU_AVAILABLE: return "tpu" if _IPU_AVAILABLE: @@ -421,7 +426,7 @@ def 
_choose_accelerator(self): # f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" # ) - def _set_parallel_devices_and_init_accelerator(self): + def _set_parallel_devices_and_init_accelerator(self) -> None: self._parallel_devices = [] if isinstance(self._accelerator_flag, Accelerator): self.accelerator = self._accelerator_flag @@ -470,7 +475,7 @@ def _set_parallel_devices_and_init_accelerator(self): self._gpus = self._device_flag if not self._gpus else self._gpus self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores - def _choose_and_init_cluster_environment(self): + def _choose_and_init_cluster_environment(self) -> None: self.cluster_environment = LightningEnvironment() if isinstance(self._cluster_environment_flag, ClusterEnvironment): self.cluster_environment = self._cluster_environment_flag @@ -482,7 +487,6 @@ def _choose_and_init_cluster_environment(self): if env_type.detect(): self.cluster_environment = env_type() - @property def _is_sharded_training_type(self) -> bool: return isinstance(self._strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)) @@ -496,14 +500,14 @@ def _is_slurm_managing_tasks(self): num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) return num_slurm_tasks == total_requested_devices - def _choose_strategy(self): + def _choose_strategy(self) -> None: if self._accelerator_flag == "ipu": self._strategy_flag = "ipu_strategy" elif self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: self._strategy_flag = "tpu_spawn" else: - # TODO lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" + # TODO: lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" self._strategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) elif _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): self._strategy_flag = "horovod" @@ -516,14 +520,16 @@ def _choose_strategy(self): if self._accelerator_flag == "gpu" else "cpu" ) - # TODO lazy initialized device, then here could be self._strategy_flag = "single_device" + # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" self._strategy_flag = SingleDeviceStrategy(device=device) elif len(self._parallel_devices) > 1: self._strategy_flag = "ddp_spawn" else: self._strategy_flag = "ddp" - def _strategy_check_and_fallbacks(self): + def _check_strategy_and_fallback(self) -> None: + """Checks edge cases when the strategy selection was a string input, and we need to fall back to a different + choice depending on other parameters or the environment.""" # current logic, fallback only apply to user pass in str config not object config strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag @@ -552,7 +558,7 @@ def _strategy_check_and_fallbacks(self): if strategy_flag: self._strategy_flag = strategy_flag - def handle_horovod(self): + def _handle_horovod(self) -> None: if self._num_nodes_flag > 1: raise MisconfigurationException( "Horovod does not support setting num_nodes / num_gpus explicitly. 
Use " @@ -572,11 +578,12 @@ def handle_horovod(self): else: self._parallel_devices = [torch.device("cpu")] * hvd.local_size() - def _init_strategy(self): + def _init_strategy(self) -> None: + """Instantiate the Strategy given depending on the setting of ``_strategy_flag``.""" if isinstance(self._strategy_flag, HorovodStrategy) or self._strategy_flag == "horovod": # handle horovod has to happen before initialize strategy because HorovodStrategy needs hvd.init() first. # TODO lazy initialized and setup horovod strategy `global_rank` - self.handle_horovod() + self._handle_horovod() if isinstance(self._strategy_flag, str): self.strategy = StrategyRegistry.get(self._strategy_flag) elif isinstance(self._strategy_flag, Strategy): @@ -584,8 +591,8 @@ def _init_strategy(self): else: raise RuntimeError(f"{self.strategy} is not valid type: {self.strategy}") - def _check_capatibility_and_init_precision(self): - self._precision_misconfig_check() + def _check_and_init_precision(self) -> PrecisionPlugin: + self._validate_precision_choice() if isinstance(self._precision_plugin_flag, PrecisionPlugin): return self._precision_plugin_flag @@ -638,8 +645,9 @@ def _check_capatibility_and_init_precision(self): raise RuntimeError("No precision set") - def _precision_misconfig_check(self): - # TODO change exception type to ImpactableConfigurationException + def _validate_precision_choice(self) -> None: + """Validate the combination of choices for precision, AMP type, and accelerator.""" + # TODO: change exception type to ImpactableConfigurationException if isinstance(self.accelerator, IPUAccelerator): if self._precision_flag not in (16, 32): raise MisconfigurationException( @@ -681,7 +689,7 @@ def _precision_misconfig_check(self): ) def _lazy_init_strategy(self): - # set strategy properties + """Lazily set missing attributes on the previously instantiated strategy.""" self.strategy.accelerator = self.accelerator if self.precision_plugin: self.strategy.precision_plugin = self.precision_plugin @@ -714,7 +722,7 @@ def _lazy_init_strategy(self): " creation inside the worker function." ) - # TODO should be moved to _strategy_check_and_fallbacks(). + # TODO: should be moved to _check_strategy_and_fallback(). # Current test check precision first, so keep this check here to meet error order if isinstance(self.accelerator, TPUAccelerator) and not isinstance( self.strategy, (SingleTPUStrategy, TPUSpawnStrategy) @@ -724,18 +732,16 @@ def _lazy_init_strategy(self): f" found {self.strategy}." 
) - ############################################################################## - # the following logic should be deprecated/removed, and these information should be - # retrive from strategies and accelerators - # Added here to keep backward compabilities + """The following properties are here for backward-compatibility and will be deprecated and removed in favor + of accessing this information through the strategy/accelerator directly.""" + # TODO: deprecate all properties below @property def parallel_devices(self) -> List[Union[torch.device, int]]: return self._parallel_devices - # def _distrib_type(): @property - def device_type(self): + def device_type(self) -> str: if isinstance(self.accelerator, CPUAccelerator): return "cpu" if isinstance(self.accelerator, GPUAccelerator): @@ -746,11 +752,11 @@ def device_type(self): return "ipu" @property - def num_nodes(self): + def num_nodes(self) -> int: return self._num_nodes_flag @property - def num_processes(self): + def num_processes(self) -> int: return self.devices if self.devices is not None else 1 @property @@ -762,7 +768,7 @@ def root_gpu(self) -> Optional[int]: ) @property - def devices(self): + def devices(self) -> int: if isinstance(self.strategy, SingleDeviceStrategy): return 1 elif isinstance(self.strategy, ParallelStrategy): @@ -770,7 +776,7 @@ def devices(self): return 0 @property - def tpu_cores(self): + def tpu_cores(self) -> int: if isinstance(self.accelerator, TPUAccelerator): return self._tpu_cores return 0 @@ -794,18 +800,16 @@ def num_gpus(self) -> int: return self.devices return 0 - # def parallel_device_ids(): @property - def gpus(self): + def gpus(self) -> Optional[Union[List[int], str, int]]: return self._gpus - # if isinstance(self.accelerator, GPUAccelerator) else 0 @property - def parallel_device_ids(self): + def parallel_device_ids(self) -> Optional[List[int]]: return [i for i in range(len(self.parallel_devices))] if isinstance(self.accelerator, GPUAccelerator) else None @property - def is_distributed(self): + def is_distributed(self) -> bool: # Used for custom plugins. # Custom plugins should implement is_distributed property. 
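For orientation, the precision-selection logic added earlier in this patch (`_check_and_init_precision`) is expected to resolve a few common configurations roughly as sketched below; this is an illustrative summary assuming the corresponding hardware and packages are available, not an exhaustive mapping.

    # Trainer configuration                        -> precision plugin chosen by the connector
    Trainer(accelerator="tpu", precision="bf16")   # -> TPUBf16PrecisionPlugin
    Trainer(strategy="deepspeed", precision=16)    # -> DeepSpeedPrecisionPlugin
    Trainer(accelerator="gpu", precision=16)       # -> NativeMixedPrecisionPlugin (amp_type="native")
    Trainer(accelerator="cpu", precision=32)       # -> PrecisionPlugin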
if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): @@ -827,19 +831,19 @@ def is_distributed(self): return is_distributed @property - def has_ipu(self): + def has_ipu(self) -> bool: return isinstance(self.accelerator, IPUAccelerator) @property - def use_ipu(self): + def use_ipu(self) -> bool: return self.has_ipu @property - def has_tpu(self): + def has_tpu(self) -> bool: return isinstance(self.accelerator, TPUAccelerator) @property - def use_dp(self): + def use_dp(self) -> bool: return isinstance(self.strategy, DataParallelStrategy) @property From 932e28a31c1ceb90e6d2a9611c08d71a84123ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 6 Feb 2022 01:31:28 +0100 Subject: [PATCH 32/69] fix amp_level, amp_type mixup --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 2db1caf97d044..b09baf06282b8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -200,8 +200,8 @@ def _check_config_and_set_final_flags(self, strategy, accelerator, precision, pl self._precision_plugin_flag = None self._cluster_environment_flag = None self.checkpoint_io = None - self._amp_level_flag = amp_type - self._amp_type_flag = amp_level + self._amp_level_flag = amp_level + self._amp_type_flag = amp_type if plugins: plugins = [plugins] if not isinstance(plugins, list) else plugins From 3286de3bdd1ffb21d15116862e3d912a5ddaaa61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 6 Feb 2022 02:52:30 +0100 Subject: [PATCH 33/69] more typing fixes --- pyproject.toml | 1 - .../connectors/accelerator_connector.py | 60 +++++++++++++------ 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 15b8391cdbfcf..91e2eaa8b70d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,6 @@ module = [ "pytorch_lightning.profiler.pytorch", "pytorch_lightning.profiler.simple", "pytorch_lightning.trainer.callback_hook", - "pytorch_lightning.trainer.connectors.accelerator_connector", "pytorch_lightning.trainer.connectors.callback_connector", "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.data_loading", diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b09baf06282b8..d15f2ade22aec 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -34,7 +34,8 @@ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, - TPUPrecisionPlugin, PLUGIN_INPUT, + TPUPrecisionPlugin, + PLUGIN_INPUT, ) from pytorch_lightning.plugins.environments import ( BaguaEnvironment, @@ -96,7 +97,7 @@ def __init__( benchmark: bool = False, replace_sampler_ddp: bool = True, deterministic: bool = False, # TODO: why is it unused? - num_processes: int = None, # deprecated + num_processes: Optional[int] = None, # deprecated tpu_cores: Optional[Union[List[int], int]] = None, # deprecated ipus: Optional[int] = None, # deprecated gpus: Optional[Union[List[int], str, int]] = None, # deprecated @@ -182,17 +183,25 @@ def __init__( # 6. 
Instantiate Strategy - Part 2 self._lazy_init_strategy() - def _check_config_and_set_final_flags(self, strategy, accelerator, precision, plugins, amp_type, amp_level) -> None: + def _check_config_and_set_final_flags( + self, + strategy: Optional[Union[str, Strategy]], + accelerator: Optional[Union[str, Accelerator]], + precision: Union[int, str], + plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]], + amp_type: str, + amp_level: Optional[str], + ) -> None: """This method checks: - 1. strategy: strategy, accelerator and plugin can all be set to strategies - 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), - set self._acceelrator_flag accordingly. If the value is strategy related (instance or string), - it gets handled by 1. - 3. precision: The final value of the precision flag may be determined either by the precision argument or - by a plugin instance. - 4. plugins: a plugin could occur as a value of the strategy argument (handled by 1), or the precision - argument (handled by 3). We also extract the CheckpointIO and ClusterEnvironment plugins. + 1. strategy: strategy, accelerator and plugin can all be set to strategies + 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), + set self._acceelrator_flag accordingly. If the value is strategy related (instance or string), + it gets handled by 1. + 3. precision: The final value of the precision flag may be determined either by the precision argument or + by a plugin instance. + 4. plugins: a plugin could occur as a value of the strategy argument (handled by 1), or the precision + argument (handled by 3). We also extract the CheckpointIO and ClusterEnvironment plugins. """ self._strategy_flag = None self._accelerator_flag = None @@ -203,7 +212,7 @@ def _check_config_and_set_final_flags(self, strategy, accelerator, precision, pl self._amp_level_flag = amp_level self._amp_type_flag = amp_type - if plugins: + if plugins is not None: plugins = [plugins] if not isinstance(plugins, list) else plugins if strategy: @@ -231,6 +240,7 @@ def _check_config_and_set_final_flags(self, strategy, accelerator, precision, pl raise MisconfigurationException( "strategy str already set through strategy flag, but have also passed in through accelerator" ) + if plugins: for plugin in plugins: if isinstance(plugin, Strategy): @@ -343,7 +353,15 @@ def _check_config_and_set_final_flags(self, strategy, accelerator, precision, pl ) self._amp_level_flag = amp_level - def _check_device_config_and_set_final_flags(self, devices, num_nodes, num_processes, gpus, ipus, tpu_cores) -> None: + def _check_device_config_and_set_final_flags( + self, + devices: Optional[Union[List[int], str, int]], + num_nodes: int, + num_processes: Optional[int], + gpus: Optional[Union[List[int], str, int]], + ipus: Optional[int], + tpu_cores: Optional[Union[List[int], int]], + ) -> None: if num_nodes == "auto": self._num_nodes_flag = 1 else: @@ -369,7 +387,13 @@ def _check_device_config_and_set_final_flags(self, devices, num_nodes, num_proce ) def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( - self, devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores + self, + devices: Optional[Union[List[int], str, int]], + deprecated_devices_specific_flag: Union[int, List[int]], + num_processes: Optional[int], + gpus: Optional[List[int]], + ipus: Optional[int], + tpu_cores: Optional[Union[int, List[int]]], ) -> None: """Sets the `device_flag` based on 
num_processes, gpus, ipus, tpu_cores.""" if devices: @@ -429,7 +453,7 @@ def _choose_accelerator(self) -> str: def _set_parallel_devices_and_init_accelerator(self) -> None: self._parallel_devices = [] if isinstance(self._accelerator_flag, Accelerator): - self.accelerator = self._accelerator_flag + self.accelerator: Accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": self.accelerator = TPUAccelerator() if self._device_flag == "auto" or not self._device_flag: @@ -476,7 +500,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores def _choose_and_init_cluster_environment(self) -> None: - self.cluster_environment = LightningEnvironment() + self.cluster_environment: ClusterEnvironment = LightningEnvironment() if isinstance(self._cluster_environment_flag, ClusterEnvironment): self.cluster_environment = self._cluster_environment_flag elif self._is_slurm_managing_tasks(): @@ -688,7 +712,7 @@ def _validate_precision_choice(self) -> None: "Sharded plugins are not supported with apex, please switch to `amp_backend='native'`." ) - def _lazy_init_strategy(self): + def _lazy_init_strategy(self) -> None: """Lazily set missing attributes on the previously instantiated strategy.""" self.strategy.accelerator = self.accelerator if self.precision_plugin: @@ -776,7 +800,7 @@ def devices(self) -> int: return 0 @property - def tpu_cores(self) -> int: + def tpu_cores(self) -> Optional[Union[List[int], int]]: if isinstance(self.accelerator, TPUAccelerator): return self._tpu_cores return 0 From 774f35dfaf913a44b3233a05d369e7bcb1eb5b31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Feb 2022 18:01:18 +0000 Subject: [PATCH 34/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/accelerator_connector.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index d15f2ade22aec..63a99742f3e39 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -31,11 +31,11 @@ FullyShardedNativeMixedPrecisionPlugin, IPUPrecisionPlugin, NativeMixedPrecisionPlugin, + PLUGIN_INPUT, PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, TPUPrecisionPlugin, - PLUGIN_INPUT, ) from pytorch_lightning.plugins.environments import ( BaguaEnvironment, @@ -106,7 +106,6 @@ def __init__( """The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other components such as the Accelerator and Precision plugin. - A. accelerator flag could be: 1. strategy class (deprecated in 1.5 will be removed in 1.7) 2. strategy str (deprecated in 1.5 will be removed in 1.7) @@ -137,7 +136,6 @@ def __init__( A. Class > str B. Strategy > Accelerator/precision/plugins C. When multiple flag set to the same thing? (ignore? 
not handled for now) - """ torch.backends.cudnn.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp @@ -552,8 +550,8 @@ def _choose_strategy(self) -> None: self._strategy_flag = "ddp" def _check_strategy_and_fallback(self) -> None: - """Checks edge cases when the strategy selection was a string input, and we need to fall back to a different - choice depending on other parameters or the environment.""" + """Checks edge cases when the strategy selection was a string input, and we need to fall back to a + different choice depending on other parameters or the environment.""" # current logic, fallback only apply to user pass in str config not object config strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag From 5be85d3599c6214cdfb5dad7bf81f636418fb105 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 7 Feb 2022 10:28:15 -0800 Subject: [PATCH 35/69] Update tests/models/test_gpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- tests/models/test_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 190936096ddef..c494c0c1c18e6 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -243,7 +243,6 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus trainer = Trainer(gpus=gpus) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) assert trainer._accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) - assert trainer.gpus == gpus From f27d01ced6ec0236218ee13ea55b1d9a8f3c8b38 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 7 Feb 2022 10:39:25 -0800 Subject: [PATCH 36/69] Update pytorch_lightning/trainer/connectors/accelerator_connector.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 63a99742f3e39..94df08d30fb82 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -120,7 +120,7 @@ def __init__( backend (registed these too, and _strategy_type could be deprecated) C. plugins flag could be: - 1. List of str, which could contains: + 1. List of str, which could contain: i. strategy str ii. precision str (Not supported in the old accelerator_connector version) iii. 
checkpoint_io str (Not supported in the old accelerator_connector version) From 74cbfed88f634e74be71a7ce24e2877d9af4801e Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 7 Feb 2022 10:44:08 -0800 Subject: [PATCH 37/69] Update pytorch_lightning/trainer/connectors/accelerator_connector.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 94df08d30fb82..441d8bdead24f 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -137,6 +137,7 @@ def __init__( B. Strategy > Accelerator/precision/plugins C. When multiple flag set to the same thing? (ignore? not handled for now) """ + # TODO: move to gpu accelerator torch.backends.cudnn.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp self.sync_batchnorm = sync_batchnorm From d54ccfc6dd610a00644097296570e651277ed26b Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:24:14 -0800 Subject: [PATCH 38/69] Apply suggestions from code review Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: ananthsub --- .../trainer/connectors/accelerator_connector.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 441d8bdead24f..09a49f2d1e652 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -145,7 +145,7 @@ def __init__( # 1. Parsing flags # Get registered strategies, built-in accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() - self._existing_accelerator_type = ["tpu", "ipu", "gpu", "cpu"] + self._existing_accelerator_type = ("tpu", "ipu", "gpu", "cpu") self._supported_precision = PrecisionType.supported_types() # Raise an exception if there are conflicts between flags @@ -229,7 +229,7 @@ def _check_config_and_set_final_flags( # handle duplications and conflict if isinstance(accelerator, Strategy) and strategy != accelerator: raise MisconfigurationException( - "strategy already set through strategy flag, but have also passed in through accelerator" + f"Incompatible values set in `strategy` and `accelerator` arguments. Received both strategy={strategy} and accelerator={accelerator}" ) if ( isinstance(accelerator, str) @@ -253,7 +253,7 @@ def _check_config_and_set_final_flags( f" and you can only specify one strategy, but you have passed {plugin} as a plugin." 
) - if accelerator: + if accelerator is not None: if ( accelerator in self._existing_accelerator_type or accelerator == "auto" @@ -399,17 +399,17 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( # TODO: @awaelchli improve error message rank_zero_warn( f"The flag `devices={devices}` will be ignored, " - f"instand the device specific number {deprecated_devices_specific_flag} will be used" + f"instead the device specific number {deprecated_devices_specific_flag} will be used" ) if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: # TODO: @awaelchli improve error message - rank_zero_warn("more than one device specifc flag has been set") + rank_zero_warn("more than one device specific flag has been set") self._device_flag = deprecated_devices_specific_flag - if not self._accelerator_flag: + if self._accelerator_flag is None: # set accelerator type based on num_processes, gpus, ipus, tpu_cores if ipus: self._accelerator_flag = "ipu" From 653b5b8ba6f6c10cc020135c59d3d8c2d10d38d5 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 7 Feb 2022 14:25:02 -0800 Subject: [PATCH 39/69] support bagua --- pytorch_lightning/strategies/bagua.py | 11 ++++++---- .../connectors/accelerator_connector.py | 21 +++---------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 3c1520a712ea4..81596b9f058c6 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -13,7 +13,6 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.utilities.distributed import ReduceOp -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _BAGUA_AVAILABLE from pytorch_lightning.utilities.seed import reset_seed @@ -58,7 +57,7 @@ def __init__(self, pl_module: "pl.LightningModule") -> None: class BaguaStrategy(DDPStrategy): - distributed_backend = _StrategyType.BAGUA + distributed_backend = "bagua" def __init__( self, @@ -180,8 +179,12 @@ def _setup_model(self, model: Module) -> BaguaDistributedDataParallel: ) @classmethod - def register_plugins(cls, plugin_registry: Dict) -> None: - plugin_registry.register("bagua", cls, description="Default Bagua Plugin") + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + cls.distributed_backend, + cls, + description=f"{cls.__class__.__name__} Strategy", + ) def teardown(self) -> None: # abort the background communication for async algorithm diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 09a49f2d1e652..5d1111c851053 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -47,7 +47,6 @@ TorchElasticEnvironment, ) from pytorch_lightning.strategies import ( - BaguaStrategy, DataParallelStrategy, DDP2Strategy, DDPFullyShardedStrategy, @@ -161,9 +160,6 @@ def __init__( # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_accelerator() - # else: - # # TODO: [RFC] move to XAccelerator class init? 
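When `accelerator` is "auto" (or left as None), `_choose_accelerator()` picks the first available backend, checking TPU, then IPU, then GPU availability, and otherwise falling back to CPU. A minimal usage sketch, with the detected type depending entirely on the host machine:

    trainer = Trainer(accelerator="auto", devices=1)
    # The resolved choice is reflected on the connector and on the strategy, e.g. on a CUDA machine:
    # trainer._accelerator_connector._accelerator_flag == "gpu"
    # isinstance(trainer.strategy.accelerator, GPUAccelerator)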
- # self._check_device_availibility() self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment @@ -229,7 +225,8 @@ def _check_config_and_set_final_flags( # handle duplications and conflict if isinstance(accelerator, Strategy) and strategy != accelerator: raise MisconfigurationException( - f"Incompatible values set in `strategy` and `accelerator` arguments. Received both strategy={strategy} and accelerator={accelerator}" + f"Incompatible values set in `strategy` and `accelerator` arguments." + f"Received both strategy={strategy} and accelerator={accelerator}" ) if ( isinstance(accelerator, str) @@ -437,18 +434,6 @@ def _choose_accelerator(self) -> str: else: return "cpu" - # TODO move this to xAccelerator - # def _check_device_availibility(self): - # for accelerator_flag, available in zip( - # self._existing_accelerator_type, [_TPU_AVAILABLE, _IPU_AVAILABLE, torch.cuda.is_available(), True] - # ): - # # only apply to gpu to keep backward compatibility - # if self._accelerator_flag == accelerator_flag: - # if not available: - # raise MisconfigurationException( - # f"You choice {accelerator_flag} accelerator, but {accelerator_flag} is not available" - # ) - def _set_parallel_devices_and_init_accelerator(self) -> None: self._parallel_devices = [] if isinstance(self._accelerator_flag, Accelerator): @@ -506,7 +491,7 @@ def _choose_and_init_cluster_environment(self) -> None: rank_zero_info("Multiprocessing is handled by SLURM.") self.cluster_environment = SLURMEnvironment() else: - for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): if env_type.detect(): self.cluster_environment = env_type() From d000e0dd7ecdfec1ee2533a855bdbfe3723ed0dc Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 7 Feb 2022 16:13:44 -0800 Subject: [PATCH 40/69] rename distributed_backend to strategy_name --- pytorch_lightning/strategies/bagua.py | 4 +- pytorch_lightning/strategies/ddp.py | 4 +- pytorch_lightning/strategies/ddp2.py | 4 +- pytorch_lightning/strategies/ddp_spawn.py | 4 +- pytorch_lightning/strategies/deepspeed.py | 2 +- pytorch_lightning/strategies/dp.py | 4 +- pytorch_lightning/strategies/fully_sharded.py | 4 +- pytorch_lightning/strategies/horovod.py | 4 +- pytorch_lightning/strategies/ipu.py | 4 +- pytorch_lightning/strategies/sharded.py | 4 +- pytorch_lightning/strategies/sharded_spawn.py | 4 +- pytorch_lightning/strategies/single_device.py | 4 +- pytorch_lightning/strategies/single_tpu.py | 4 +- pytorch_lightning/strategies/strategy.py | 7 +- .../strategies/strategy_registry.py | 2 +- pytorch_lightning/strategies/tpu_spawn.py | 4 +- .../connectors/accelerator_connector.py | 11 +-- pytorch_lightning/trainer/trainer.py | 2 +- pytorch_lightning/utilities/imports.py | 2 - tests/accelerators/test_tpu.py | 4 +- tests/trainer/test_trainer.py | 74 +++++++++---------- 21 files changed, 73 insertions(+), 83 deletions(-) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 81596b9f058c6..672ea800661de 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -57,7 +57,7 @@ def __init__(self, pl_module: "pl.LightningModule") -> None: class BaguaStrategy(DDPStrategy): - distributed_backend = "bagua" + strategy_name = "bagua" def __init__( self, @@ -181,7 +181,7 @@ def _setup_model(self, model: Module) -> BaguaDistributedDataParallel: @classmethod def register_strategies(cls, 
strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 3aaa36b01edf2..ec9f417ff7799 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -74,7 +74,7 @@ class DDPStrategy(ParallelStrategy): devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. """ - distributed_backend = "ddp" + strategy_name = "ddp" def __init__( self, @@ -426,7 +426,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: find_unused_parameters=False, ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index ff84a50fa52ba..a6ce8f2f4230e 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -23,7 +23,7 @@ class DDP2Strategy(DDPStrategy): """DDP2 behaves like DP in one node, but synchronization across nodes behaves like in DDP.""" - distributed_backend = "ddp2" + strategy_name = "ddp2" @property def global_rank(self) -> int: @@ -78,7 +78,7 @@ def set_world_ranks(self) -> None: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 75ee8fd4eadb7..3c32d9e1872f5 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -52,7 +52,7 @@ class DDPSpawnStrategy(ParallelStrategy): """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes.""" - distributed_backend = "ddp_spawn" + strategy_name = "ddp_spawn" def __init__( self, @@ -365,7 +365,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: find_unused_parameters=False, ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index bd6b131574e56..cbf66b7040d22 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ b/pytorch_lightning/strategies/deepspeed.py @@ -82,7 +82,7 @@ def _move_float_tensors_to_half(self, batch: Any): class DeepSpeedStrategy(DDPStrategy): - distributed_backend = "deepspeed" + strategy_name = "deepspeed" DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" def __init__( diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index a886fcdcbee63..2aa25f8275dea 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -30,7 +30,7 @@ class DataParallelStrategy(ParallelStrategy): """Implements data-parallel training in a single process, i.e., the model gets replicated to each device and each gets a split of the data.""" - distributed_backend = "dp" + strategy_name = "dp" def __init__( self, @@ -151,7 +151,7 @@ def training_step_end(self, output): @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} 
Strategy", ) diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index cd7155cc41170..9d0999902a071 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -36,7 +36,7 @@ class DDPFullyShardedStrategy(DDPStrategy): - distributed_backend = "ddp_fully_sharded" + strategy_name = "ddp_fully_sharded" def __init__( self, @@ -214,7 +214,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 79e58c164028c..49848e0f0163e 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -36,7 +36,7 @@ class HorovodStrategy(ParallelStrategy): """Plugin for Horovod distributed training integration.""" - distributed_backend = "horovod" + strategy_name = "horovod" def __init__( self, @@ -198,7 +198,7 @@ def _filter_named_parameters(model: nn.Module, optimizer: Optimizer) -> List[Tup @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index d6e1e12a36465..7252e2bf3f583 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -62,7 +62,7 @@ def _move_float_tensors_to_half(self, batch: Any) -> Any: class IPUStrategy(ParallelStrategy): """Plugin for training on IPU devices.""" - distributed_backend = "ipu_strategy" + strategy_name = "ipu_strategy" def __init__( self, @@ -366,7 +366,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 4efdfb685722f..b39bae1f02369 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -37,7 +37,7 @@ class DDPShardedStrategy(DDPStrategy): """Optimizer and gradient sharded training provided by FairScale.""" - distributed_backend = "ddp_sharded" + strategy_name = "ddp_sharded" _REDUCE_BUFFER_SIZE_DEFAULT: int = 2 ** 23 # 8M def configure_ddp(self) -> None: @@ -136,7 +136,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: find_unused_parameters=False, ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index a6e007a4be31b..d4a2629f0862b 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -35,7 +35,7 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): """Optimizer sharded training provided by FairScale.""" - distributed_backend = "ddp_sharded_spawn" + strategy_name = "ddp_sharded_spawn" def configure_ddp(self) -> None: self.model, self.optimizers = self._setup_model_and_optimizers( @@ -118,7 +118,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: 
find_unused_parameters=False, ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index cdbd8eaa2b7a6..bc17dd08634fd 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -27,7 +27,7 @@ class SingleDeviceStrategy(Strategy): """Strategy that handles communication on a single device.""" - distributed_backend = "single_device" + strategy_name = "single_device" def __init__( self, @@ -84,7 +84,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: @classmethod def register_strategies(cls, strategy_registry: dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/single_tpu.py b/pytorch_lightning/strategies/single_tpu.py index 942f9ebfa9a41..66f90c2cd15f1 100644 --- a/pytorch_lightning/strategies/single_tpu.py +++ b/pytorch_lightning/strategies/single_tpu.py @@ -28,7 +28,7 @@ class SingleTPUStrategy(SingleDeviceStrategy): """Strategy for training on a single TPU device.""" - distributed_backend = "single_tpu" + strategy_name = "single_tpu" def __init__( self, @@ -76,7 +76,7 @@ def model_to_device(self) -> None: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index 33354dc539201..2106c2bb1ede1 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -441,12 +441,7 @@ def teardown(self) -> None: @classmethod def register_strategies(cls, strategy_registry) -> None: - if cls.distributed_backend: - strategy_registry.register( - cls.distributed_backend, - cls, - description=f"{cls.__class__.__name__} Strategy", - ) + pass def on_train_start(self) -> None: """Called when train begins.""" diff --git a/pytorch_lightning/strategies/strategy_registry.py b/pytorch_lightning/strategies/strategy_registry.py index b0d7995053a30..17e08acb23bcc 100644 --- a/pytorch_lightning/strategies/strategy_registry.py +++ b/pytorch_lightning/strategies/strategy_registry.py @@ -75,7 +75,7 @@ def register( def do_register(strategy: Callable) -> Callable: data["strategy"] = strategy - data["distributed_backend"] = strategy.distributed_backend + data["strategy_name"] = strategy.strategy_name self[name] = data return strategy diff --git a/pytorch_lightning/strategies/tpu_spawn.py b/pytorch_lightning/strategies/tpu_spawn.py index 4bcf0d1ef31b6..71db3a64ec466 100644 --- a/pytorch_lightning/strategies/tpu_spawn.py +++ b/pytorch_lightning/strategies/tpu_spawn.py @@ -52,7 +52,7 @@ class TPUSpawnStrategy(DDPSpawnStrategy): """Strategy for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method.""" - distributed_backend = "tpu_spawn" + strategy_name = "tpu_spawn" def __init__( self, @@ -350,7 +350,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: ) strategy_registry.register( - cls.distributed_backend, + cls.strategy_name, cls, description=f"{cls.__class__.__name__} Strategy", ) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py 
b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 5d1111c851053..fc348233af732 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -264,9 +264,6 @@ def _check_config_and_set_final_flags( ) self._strategy_flag = accelerator elif accelerator == "ddp_cpu": - rank_zero_warn( - "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." - ) self._strategy_flag = accelerator if precision: @@ -720,10 +717,10 @@ def _lazy_init_strategy(self) -> None: from pytorch_lightning.utilities import _IS_INTERACTIVE interactive_compatible_strategy = ("dp", "ddp_spawn", "ddp_sharded_spawn", "tpu_spawn") - if _IS_INTERACTIVE and self.strategy.distributed_backend not in interactive_compatible_strategy: + if _IS_INTERACTIVE and self.strategy.strategy_name not in interactive_compatible_strategy: raise MisconfigurationException( - f"`Trainer(strategy={self.strategy.distributed_backend!r})` or" - f" `Trainer(accelerator={self.strategy.distributed_backend!r})` is not compatible with an interactive" + f"`Trainer(strategy={self.strategy.strategy_name!r})` or" + f" `Trainer(accelerator={self.strategy.strategy_name!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" f" {', '.join(interactive_compatible_strategy)}." " In case you are spawning processes yourself, make sure to include the Trainer" @@ -856,4 +853,4 @@ def use_dp(self) -> bool: @property def _strategy_type(self) -> _StrategyType: - return self.strategy.distributed_backend + return self.strategy.strategy_name diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e9e89f01675ac..53ea205115de1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1970,7 +1970,7 @@ def should_rank_save_checkpoint(self) -> bool: @property def _strategy_type(self) -> _StrategyType: - return self.strategy.distributed_backend + return self.strategy.strategy_name @property def _device_type(self) -> _AcceleratorType: diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 24355097ce34f..6c20d90e01646 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -133,8 +133,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False -_GPU_AVAILABLE = torch.cuda.is_available() and torch.cuda.device_count() > 0 - # experimental feature within PyTorch Lightning. 
def _fault_tolerant_training() -> bool: diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 2c0b265b0fd16..dc004f957dac1 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -228,13 +228,13 @@ def test_ddp_cpu_not_supported_on_tpus(): @RunIf(tpu=True) -def test_strategy_choice_tpu_str_ddp_spawn(tmpdir, strategy): +def test_strategy_choice_tpu_str_ddp_spawn(tmpdir): with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): Trainer(strategy="ddp_spawn", accelerator="tpu", devices=8) @RunIf(tpu=True) -def test_strategy_choice_tpu_str_tpu_spawn_debug(tmpdir, strategy): +def test_strategy_choice_tpu_str_tpu_spawn_debug(tmpdir): trainer = Trainer(strategy="tpu_spawn_debug", accelerator="tpu", devices=8) assert isinstance(trainer.strategy, TPUSpawnStrategy) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 32aa94b8e0b2c..0d2d8bbdc55b6 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -47,7 +47,7 @@ DDPStrategy, ) from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _AcceleratorType, _StrategyType +from pytorch_lightning.utilities import _AcceleratorType from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.imports import _IS_WINDOWS, _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_8 @@ -1189,15 +1189,15 @@ def val_dataloader(self): ), ( dict(accelerator="ddp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp", num_nodes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp2", gpus=None), @@ -1209,43 +1209,43 @@ def val_dataloader(self): ), ( dict(accelerator="dp", gpus=1), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="dp", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator="ddp", gpus=1), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=1), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="ddp2", gpus=1), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp2", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(accelerator=None, gpus=2), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="dp", gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, 
num_gpus=2), + dict(_strategy_type="dp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp", gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp2", gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp2", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(accelerator="ddp2", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(accelerator="dp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ], ) @@ -2109,11 +2109,11 @@ def training_step(self, batch, batch_idx): ), ( dict(strategy="ddp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp", num_nodes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp2", gpus=None), @@ -2125,47 +2125,47 @@ def training_step(self, batch, batch_idx): ), ( dict(strategy="dp", gpus=1), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="dp", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp", gpus=1), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp_spawn", gpus=1), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy="ddp2", gpus=1), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp2", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy=None, gpus=2), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="dp", gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="dp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp", gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp2", gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp2", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy="ddp2", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="dp", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, 
_device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp_spawn", num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy="ddp_spawn", num_processes=1, gpus=None), @@ -2173,36 +2173,36 @@ def training_step(self, batch, batch_idx): ), ( dict(strategy="ddp_fully_sharded", gpus=1), - dict(_strategy_type=_StrategyType.DDP_FULLY_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=1), + dict(_strategy_type="ddp_fully_sharded", _device_type=_AcceleratorType.GPU, num_gpus=1), ), ( dict(strategy=DDPSpawnStrategy(), num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy=DDPSpawnStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP_SPAWN, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp_spawn", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDPStrategy(), num_processes=2, gpus=None), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.CPU, num_gpus=0), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.CPU, num_gpus=0), ), ( dict(strategy=DDPStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDP2Strategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP2, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp2", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DataParallelStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DP, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="dp", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ( dict(strategy=DDPFullyShardedStrategy(), gpus=2), dict( - _strategy_type=_StrategyType.DDP_FULLY_SHARDED, + _strategy_type="ddp_fully_sharded", _device_type=_AcceleratorType.GPU, num_gpus=2, ), @@ -2210,14 +2210,14 @@ def training_step(self, batch, batch_idx): ( dict(strategy=DDPSpawnShardedStrategy(), gpus=2), dict( - _strategy_type=_StrategyType.DDP_SHARDED_SPAWN, + _strategy_type="ddp_sharded_spawn", _device_type=_AcceleratorType.GPU, num_gpus=2, ), ), ( dict(strategy=DDPShardedStrategy(), gpus=2), - dict(_strategy_type=_StrategyType.DDP_SHARDED, _device_type=_AcceleratorType.GPU, num_gpus=2), + dict(_strategy_type="ddp_sharded", _device_type=_AcceleratorType.GPU, num_gpus=2), ), ], ) From 344a5e6e39c567f861425fee133f6cc9d97549d1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 7 Feb 2022 16:27:09 -0800 Subject: [PATCH 41/69] distributed_backend to strategy_name in tests/ --- tests/strategies/test_strategy_registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/strategies/test_strategy_registry.py b/tests/strategies/test_strategy_registry.py index ab0629b28b698..89422b3719a29 100644 --- a/tests/strategies/test_strategy_registry.py +++ b/tests/strategies/test_strategy_registry.py @@ -31,7 +31,7 @@ def test_strategy_registry_with_new_strategy(): class TestStrategy: - distributed_backend = "test_strategy" + strategy_name = "test_strategy" def __init__(self, param1, param2): self.param1 = param1 @@ -45,7 +45,7 @@ def 
__init__(self, param1, param2): assert strategy_name in StrategyRegistry assert StrategyRegistry[strategy_name]["description"] == strategy_description assert StrategyRegistry[strategy_name]["init_params"] == {"param1": "abc", "param2": 123} - assert StrategyRegistry[strategy_name]["distributed_backend"] == "test_strategy" + assert StrategyRegistry[strategy_name]["strategy_name"] == "test_strategy" assert isinstance(StrategyRegistry.get(strategy_name), TestStrategy) StrategyRegistry.remove(strategy_name) From 1707696a936f76d9f53fd7d3b00434e629736e0f Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 10 Feb 2022 14:31:42 +0530 Subject: [PATCH 42/69] Fix tpu tests --- tests/callbacks/test_device_stats_monitor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index c90ce4a4ba96b..51cbf21d1f609 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -123,6 +123,8 @@ def test_device_stats_monitor_no_logger(tmpdir): trainer = Trainer( default_root_dir=tmpdir, + accelerator="cpu", + devices=1, callbacks=[device_stats], max_epochs=1, logger=False, From 05a03d0e95bffb1296ef81a1ca498525ae4d9566 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 9 Feb 2022 18:32:38 -0800 Subject: [PATCH 43/69] draft --- .../trainer/connectors/accelerator_connector.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index fc348233af732..7afde50f32967 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -365,13 +365,14 @@ def _check_device_config_and_set_final_flags( # TODO: Delete this parsing section when num_processes, gpus, ipus and tpu_cores get removed self._gpus = gpus self._tpu_cores = tpu_cores - gpus = device_parser.parse_gpu_ids(gpus) - tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores - if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): - self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( - devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores - ) + if not self._device_flag: + gpus = device_parser.parse_gpu_ids(gpus) + tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores + if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): + self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores + ) if self._device_flag == "auto" and self._accelerator_flag is None: raise MisconfigurationException( From d4c78f85dc78bd73b6f2cd98a88061f8595bc208 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 9 Feb 2022 19:41:13 -0800 Subject: [PATCH 44/69] add device=0 error message and update tests --- .../connectors/accelerator_connector.py | 18 ++++++++++-------- tests/trainer/flags/test_env_vars.py | 2 +- tests/trainer/test_trainer_cli.py | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 
7afde50f32967..6d7dd7cb13e04 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -359,20 +359,22 @@ def _check_device_config_and_set_final_flags( self._num_nodes_flag = 1 else: self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + if devices in (0, "0", "0,"): + print(devices) + raise MisconfigurationException(f"You passed `devices={devices}`, please set a number > 0") self._device_flag = devices # TODO: Delete this parsing section when num_processes, gpus, ipus and tpu_cores get removed self._gpus = gpus self._tpu_cores = tpu_cores - if not self._device_flag: - gpus = device_parser.parse_gpu_ids(gpus) - tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores - if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): - self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( - devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores - ) + gpus = device_parser.parse_gpu_ids(gpus) + tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores + if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): + self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores + ) if self._device_flag == "auto" and self._accelerator_flag is None: raise MisconfigurationException( diff --git a/tests/trainer/flags/test_env_vars.py b/tests/trainer/flags/test_env_vars.py index bbcc5447d03ce..0e9e6469d67a8 100644 --- a/tests/trainer/flags/test_env_vars.py +++ b/tests/trainer/flags/test_env_vars.py @@ -53,4 +53,4 @@ def test_passing_env_variables_devices(cuda_available_mock, device_count_mock): trainer = Trainer() assert trainer.devices == 2 trainer = Trainer(accelerator="gpu", devices=1) - assert trainer.devices == 1 + assert trainer.devices == 2 diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py index 25221f8111f96..330b0f75ffb61 100644 --- a/tests/trainer/test_trainer_cli.py +++ b/tests/trainer/test_trainer_cli.py @@ -163,7 +163,7 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected): @pytest.mark.parametrize( ["cli_args", "expected_parsed", "expected_device_ids"], - [("", None, None), ("--accelerator gpu --devices 1", "1", [0]), ("--accelerator gpu --devices 0,", "0,", [0])], + [("", None, None), ("--accelerator gpu --devices 1", "1", [0])], ) @RunIf(min_gpus=1) def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_device_ids): From 77d2cd1fe6ee73a704b14d0779b98606ffc21cb1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 9 Feb 2022 21:09:43 -0800 Subject: [PATCH 45/69] fix gpu tests --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6d7dd7cb13e04..142ed540416e2 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -263,7 +263,7 @@ def _check_config_and_set_final_flags( f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead."
) self._strategy_flag = accelerator - elif accelerator == "ddp_cpu": + elif accelerator == "ddp_cpu" and not self._strategy_flag: self._strategy_flag = accelerator if precision: From d8c5ccc65cb9786837e5005cfcccd35a99f50639 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 10 Feb 2022 12:18:07 -0800 Subject: [PATCH 46/69] test revert accelerator auto logic --- .../trainer/connectors/accelerator_connector.py | 10 ++++------ tests/callbacks/test_device_stats_monitor.py | 2 -- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 142ed540416e2..c6a0914fe81f3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -419,16 +419,14 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( def _choose_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" - if _TPU_AVAILABLE: - return "tpu" - if _IPU_AVAILABLE: - return "ipu" if self._accelerator_flag == "auto": + if _TPU_AVAILABLE: + return "tpu" + if _IPU_AVAILABLE: + return "ipu" if torch.cuda.is_available() and torch.cuda.device_count() > 0: return "gpu" else: - if self._device_flag == "auto": - self._device_flag = 1 return "cpu" # [RFC] this is current logic, if accelerator not set, default cpu? else: diff --git a/tests/callbacks/test_device_stats_monitor.py b/tests/callbacks/test_device_stats_monitor.py index 51cbf21d1f609..c90ce4a4ba96b 100644 --- a/tests/callbacks/test_device_stats_monitor.py +++ b/tests/callbacks/test_device_stats_monitor.py @@ -123,8 +123,6 @@ def test_device_stats_monitor_no_logger(tmpdir): trainer = Trainer( default_root_dir=tmpdir, - accelerator="cpu", - devices=1, callbacks=[device_stats], max_epochs=1, logger=False, From 917039f98ec33968d10e50f503ee76f56870e9b1 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 11 Feb 2022 08:32:55 +0530 Subject: [PATCH 47/69] Tiny update to choose accelerator --- .../trainer/connectors/accelerator_connector.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c6a0914fe81f3..57b6a078a0277 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -360,7 +360,6 @@ def _check_device_config_and_set_final_flags( else: self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 if devices in (0, "0", "0,"): - print(devices) raise MisconfigurationException(f"You passed `devices={devices}`, please set a number > 0") self._device_flag = devices @@ -426,11 +425,8 @@ def _choose_accelerator(self) -> str: return "ipu" if torch.cuda.is_available() and torch.cuda.device_count() > 0: return "gpu" - else: - return "cpu" # [RFC] this is current logic, if accelerator not set, default cpu? 
- else: - return "cpu" + return "cpu" def _set_parallel_devices_and_init_accelerator(self) -> None: self._parallel_devices = [] From f2d53fa7fda352ff447fc02bfa04abccd8fb3f89 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 10 Feb 2022 21:18:05 -0800 Subject: [PATCH 48/69] fix ipu and gpu tests --- pytorch_lightning/lite/lite.py | 2 +- .../trainer/connectors/accelerator_connector.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 95beb85b1cdad..29b98ecb7e67e 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -82,7 +82,7 @@ def __init__( self._check_strategy_support(strategy) gpu_ids, tpu_cores = _parse_devices(gpus=gpus, auto_select_gpus=False, tpu_cores=tpu_cores) self._accelerator_connector = AcceleratorConnector( - num_processes=1, + num_processes=None, devices=devices, tpu_cores=tpu_cores, ipus=None, diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 57b6a078a0277..daa9b3616f55c 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -56,6 +56,7 @@ DDPStrategy, DeepSpeedStrategy, HorovodStrategy, + IPUStrategy, ParallelStrategy, SingleDeviceStrategy, SingleTPUStrategy, @@ -418,11 +419,11 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( def _choose_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" + if _IPU_AVAILABLE: + return "ipu" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: return "tpu" - if _IPU_AVAILABLE: - return "ipu" if torch.cuda.is_available() and torch.cuda.device_count() > 0: return "gpu" # [RFC] this is current logic, if accelerator not set, default cpu? @@ -834,11 +835,11 @@ def is_distributed(self) -> bool: @property def has_ipu(self) -> bool: - return isinstance(self.accelerator, IPUAccelerator) + return isinstance(self.accelerator, IPUAccelerator) and isinstance(self.strategy, IPUStrategy) @property def use_ipu(self) -> bool: - return self.has_ipu + return isinstance(self.accelerator, IPUAccelerator) @property def has_tpu(self) -> bool: From 266d3f8f4e9177f82ee67db622d598b855025bbf Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 11 Feb 2022 11:21:52 -0800 Subject: [PATCH 49/69] add special handling for ipustrategy --- .../trainer/connectors/accelerator_connector.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index daa9b3616f55c..1aac1c3297ba6 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -159,6 +159,7 @@ def __init__( # 2. 
Instantiate Accelerator # handle `auto` and `None` + self._special_handle_for_ipu() if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_accelerator() self._set_parallel_devices_and_init_accelerator() @@ -417,16 +418,22 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( if num_processes: self._accelerator_flag = "cpu" + def _special_handle_for_ipu(self) -> None: + # current logic only apply to object config + # TODO this logic should apply to both str and object config + if isinstance(self._strategy_flag, IPUStrategy): + self._accelerator_flag = "ipu" + def _choose_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" - if _IPU_AVAILABLE: - return "ipu" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: return "tpu" + if _IPU_AVAILABLE: + return "ipu" if torch.cuda.is_available() and torch.cuda.device_count() > 0: return "gpu" - # [RFC] this is current logic, if accelerator not set, default cpu? + # [RFC] this is current logic, if accelerator=None, default cpu? return "cpu" def _set_parallel_devices_and_init_accelerator(self) -> None: @@ -533,7 +540,8 @@ def _choose_strategy(self) -> None: def _check_strategy_and_fallback(self) -> None: """Checks edge cases when the strategy selection was a string input, and we need to fall back to a different choice depending on other parameters or the environment.""" - # current logic, fallback only apply to user pass in str config not object config + # current fallback and check logic only apply to user pass in str config and object config + # TODO this logic should apply to both str and object config strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if strategy_flag == "ddp_cpu": @@ -714,6 +722,7 @@ def _lazy_init_strategy(self) -> None: from pytorch_lightning.utilities import _IS_INTERACTIVE + # TODO move is_compatible logic to strategy API interactive_compatible_strategy = ("dp", "ddp_spawn", "ddp_sharded_spawn", "tpu_spawn") if _IS_INTERACTIVE and self.strategy.strategy_name not in interactive_compatible_strategy: raise MisconfigurationException( From 0f833f91224c9be011c7451f81f218f467bccbae Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 11 Feb 2022 11:51:06 -0800 Subject: [PATCH 50/69] Address comments --- pytorch_lightning/strategies/bagua.py | 2 +- pytorch_lightning/strategies/ddp.py | 2 +- pytorch_lightning/strategies/ddp2.py | 2 +- pytorch_lightning/strategies/ddp_spawn.py | 2 +- pytorch_lightning/strategies/dp.py | 2 +- pytorch_lightning/strategies/fully_sharded.py | 2 +- pytorch_lightning/strategies/horovod.py | 2 +- pytorch_lightning/strategies/ipu.py | 2 +- pytorch_lightning/strategies/sharded.py | 2 +- pytorch_lightning/strategies/sharded_spawn.py | 2 +- pytorch_lightning/strategies/single_device.py | 2 +- pytorch_lightning/strategies/single_tpu.py | 2 +- pytorch_lightning/strategies/tpu_spawn.py | 2 +- pytorch_lightning/trainer/trainer.py | 2 +- pytorch_lightning/utilities/exceptions.py | 8 -------- tests/strategies/test_ddp_strategy.py | 7 +------ 16 files changed, 15 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 672ea800661de..17318331b840d 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -183,7 +183,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, 
- description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index ec9f417ff7799..010bbf4baa573 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -428,7 +428,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def _should_run_deadlock_detection(self) -> bool: diff --git a/pytorch_lightning/strategies/ddp2.py b/pytorch_lightning/strategies/ddp2.py index a6ce8f2f4230e..2023316e0e118 100644 --- a/pytorch_lightning/strategies/ddp2.py +++ b/pytorch_lightning/strategies/ddp2.py @@ -80,5 +80,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/strategies/ddp_spawn.py b/pytorch_lightning/strategies/ddp_spawn.py index 3c32d9e1872f5..a2415c72d5c7c 100644 --- a/pytorch_lightning/strategies/ddp_spawn.py +++ b/pytorch_lightning/strategies/ddp_spawn.py @@ -367,7 +367,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py index 2aa25f8275dea..484f7b474b02f 100644 --- a/pytorch_lightning/strategies/dp.py +++ b/pytorch_lightning/strategies/dp.py @@ -153,7 +153,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 9d0999902a071..af2d6d74bfdd2 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -216,5 +216,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 49848e0f0163e..f4a733909651e 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -200,7 +200,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py index 7252e2bf3f583..6f6f4dd92a1f9 100644 --- a/pytorch_lightning/strategies/ipu.py +++ b/pytorch_lightning/strategies/ipu.py @@ -368,5 +368,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 
b39bae1f02369..6811721ecaab7 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -138,5 +138,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index d4a2629f0862b..8cb6ca8b62028 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -120,5 +120,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/strategies/single_device.py b/pytorch_lightning/strategies/single_device.py index bc17dd08634fd..da80bad32ad13 100644 --- a/pytorch_lightning/strategies/single_device.py +++ b/pytorch_lightning/strategies/single_device.py @@ -86,7 +86,7 @@ def register_strategies(cls, strategy_registry: dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/single_tpu.py b/pytorch_lightning/strategies/single_tpu.py index 66f90c2cd15f1..757b335e5ae2c 100644 --- a/pytorch_lightning/strategies/single_tpu.py +++ b/pytorch_lightning/strategies/single_tpu.py @@ -78,7 +78,7 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) def teardown(self) -> None: diff --git a/pytorch_lightning/strategies/tpu_spawn.py b/pytorch_lightning/strategies/tpu_spawn.py index 71db3a64ec466..867624fd2151e 100644 --- a/pytorch_lightning/strategies/tpu_spawn.py +++ b/pytorch_lightning/strategies/tpu_spawn.py @@ -352,5 +352,5 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( cls.strategy_name, cls, - description=f"{cls.__class__.__name__} Strategy", + description=f"{cls.__class__.__name__}", ) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 53ea205115de1..2a4c01061c922 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -138,7 +138,7 @@ def __init__( gradient_clip_algorithm: Optional[str] = None, process_position: int = 0, num_nodes: int = 1, - num_processes: int = None, + num_processes: Optional[int] = None, devices: Optional[Union[List[int], str, int]] = None, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, diff --git a/pytorch_lightning/utilities/exceptions.py b/pytorch_lightning/utilities/exceptions.py index 548e0cb655945..ece4629819b33 100644 --- a/pytorch_lightning/utilities/exceptions.py +++ b/pytorch_lightning/utilities/exceptions.py @@ -17,14 +17,6 @@ class MisconfigurationException(Exception): """Exception used to inform users of misuse with PyTorch Lightning.""" -class DeviceNotAvailableException(Exception): - """Exception used to inform users that requested devices are not availible.""" - - -class ImpactableConfigurationException(Exception): - """Exception used to inform users that configuration impactable with each other.""" - - class 
DeadlockDetectedException(Exception): """Exception used when a deadlock has been detected and processes are being killed.""" diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index dddeaed26d98f..157908309f0e6 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -97,7 +97,7 @@ def creates_processes_externally(self): @RunIf(skip_windows=True) -def test_ddp_configure_ddp_fitting(): +def test_ddp_configure_ddp(): """Tests with ddp strategy.""" model = BoringModel() ddp_strategy = DDPStrategy() @@ -115,11 +115,6 @@ def test_ddp_configure_ddp_fitting(): # in DDPStrategy configure_ddp(), model wrapped by DistributedDataParallel assert isinstance(trainer.model, DistributedDataParallel) - -@RunIf(skip_windows=True) -def test_ddp_configure_ddp_validating(): - model = BoringModel() - ddp_strategy = DDPStrategy() trainer = Trainer( max_epochs=1, strategy=ddp_strategy, From 6b434e2dc9c06fb4020c5584c9eb9c742c89c3ef Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Mon, 14 Feb 2022 14:48:49 -0800 Subject: [PATCH 51/69] address comments and add kaushik's suggestions --- pytorch_lightning/strategies/tpu_spawn.py | 2 +- .../connectors/accelerator_connector.py | 196 +++++++++--------- pytorch_lightning/trainer/trainer.py | 2 +- 3 files changed, 99 insertions(+), 101 deletions(-) diff --git a/pytorch_lightning/strategies/tpu_spawn.py b/pytorch_lightning/strategies/tpu_spawn.py index 867624fd2151e..b43267b5c91d6 100644 --- a/pytorch_lightning/strategies/tpu_spawn.py +++ b/pytorch_lightning/strategies/tpu_spawn.py @@ -52,7 +52,7 @@ class TPUSpawnStrategy(DDPSpawnStrategy): """Strategy for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method.""" - strategy_name = "tpu_spawn" + strategy_name = "tpu_spawn_strategy" def __init__( self, diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 1aac1c3297ba6..500274fd7a4d8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -72,7 +72,6 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE @@ -96,15 +95,15 @@ def __init__( sync_batchnorm: bool = False, benchmark: bool = False, replace_sampler_ddp: bool = True, - deterministic: bool = False, # TODO: why is it unused? + deterministic: bool = False, num_processes: Optional[int] = None, # deprecated tpu_cores: Optional[Union[List[int], int]] = None, # deprecated ipus: Optional[int] = None, # deprecated gpus: Optional[Union[List[int], str, int]] = None, # deprecated - gpu_ids: Optional[List[int]] = None, # TODO: why is it unused? + gpu_ids: Optional[List[int]] = None, # TODO can be removed ) -> None: """The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other - components such as the Accelerator and Precision plugin. + components such as the Accelerator and Precision plugins. A. accelerator flag could be: 1. strategy class (deprecated in 1.5 will be removed in 1.7) @@ -115,7 +114,7 @@ def __init__( B. strategy flag could be : 1. strategy class - 2. strategy str registered with strategyRegister + 2. strategy str registered with StrategyRegistry 3. 
strategy str in _strategy_type enum which listed in each strategy as backend (registed these too, and _strategy_type could be deprecated) @@ -146,7 +145,7 @@ def __init__( # Get registered strategies, built-in accelerators and precision plugins self._existing_strategies_str = StrategyRegistry.available_strategies() self._existing_accelerator_type = ("tpu", "ipu", "gpu", "cpu") - self._supported_precision = PrecisionType.supported_types() + self._supported_precision_type = ("16", "32", "64", "bf16", "mixed") # Raise an exception if there are conflicts between flags # Set each valid flag to `self._x_flag` after validation @@ -165,11 +164,11 @@ def __init__( self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment - self._choose_and_init_cluster_environment() + self.cluster_environment = self._choose_and_init_cluster_environment() # 4. Instantiate Strategy - Part 1 if self._strategy_flag is None: - self._choose_strategy() + self._strategy_flag = self._choose_strategy() # In specific cases, ignore user selection and fall back to a different strategy self._check_strategy_and_fallback() self._init_strategy() @@ -193,7 +192,7 @@ def _check_config_and_set_final_flags( 1. strategy: strategy, accelerator and plugin can all be set to strategies 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), - set self._acceelrator_flag accordingly. If the value is strategy related (instance or string), + set self.accelerator_flag accordingly. If the value is strategy related (instance or string), it gets handled by 1. 3. precision: The final value of the precision flag may be determined either by the precision argument or by a plugin instance. @@ -212,7 +211,7 @@ def _check_config_and_set_final_flags( if plugins is not None: plugins = [plugins] if not isinstance(plugins, list) else plugins - if strategy: + if strategy is not None: self._strategy_flag = strategy if strategy == "ddp_cpu": raise MisconfigurationException( @@ -238,7 +237,6 @@ def _check_config_and_set_final_flags( raise MisconfigurationException( "strategy str already set through strategy flag, but have also passed in through accelerator" ) - if plugins: for plugin in plugins: if isinstance(plugin, Strategy): @@ -268,11 +266,11 @@ def _check_config_and_set_final_flags( elif accelerator == "ddp_cpu" and not self._strategy_flag: self._strategy_flag = accelerator - if precision: - if not PrecisionType.supported_type(precision): + if precision is not None: + if str(precision) not in self._supported_precision_type: raise MisconfigurationException( f"Precision {repr(precision)} is invalid. 
" - f"Allowed precision values: {PrecisionType.supported_types()}" + f"Allowed precision values: {self._supported_precision_type}" ) self._precision_flag = precision @@ -287,7 +285,7 @@ def _check_config_and_set_final_flags( elif isinstance(plugin, PrecisionPlugin): self._precision_plugin_flag = plugin - elif isinstance(plugin, str) and plugin in self._supported_precision: + elif isinstance(plugin, str) and plugin in self._supported_precision_type: self._precision_flag = plugin elif isinstance(plugin, CheckpointIO): self.checkpoint_io = plugin @@ -339,7 +337,7 @@ def _check_config_and_set_final_flags( if self._strategy_flag.parallel_devices[0].type == "cuda": self._accelerator_flag = "gpu" - amp_type = amp_type.lower() if isinstance(amp_type, str) else None + amp_type = amp_type if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) if amp_level is not None and self._amp_type_flag != AMPType.APEX: @@ -366,16 +364,10 @@ def _check_device_config_and_set_final_flags( self._device_flag = devices - # TODO: Delete this parsing section when num_processes, gpus, ipus and tpu_cores get removed - self._gpus = gpus - self._tpu_cores = tpu_cores - gpus = device_parser.parse_gpu_ids(gpus) - tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores - if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): - self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( - devices, deprecated_devices_specific_flag, num_processes, gpus, ipus, tpu_cores - ) + # TODO: Delete this method num_processes, gpus, ipus and tpu_cores get removed + self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( + devices, num_processes, gpus, ipus, tpu_cores + ) if self._device_flag == "auto" and self._accelerator_flag is None: raise MisconfigurationException( @@ -386,37 +378,42 @@ def _check_device_config_and_set_final_flags( def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self, devices: Optional[Union[List[int], str, int]], - deprecated_devices_specific_flag: Union[int, List[int]], num_processes: Optional[int], gpus: Optional[List[int]], ipus: Optional[int], tpu_cores: Optional[Union[int, List[int]]], ) -> None: - """Sets the `device_flag` based on num_processes, gpus, ipus, tpu_cores.""" - if devices: - # TODO: @awaelchli improve error message - rank_zero_warn( - f"The flag `devices={devices}` will be ignored, " - f"instead the device specific number {deprecated_devices_specific_flag} will be used" - ) + """Sets the `device_flag` and `accelerator_flag `based on num_processes, gpus, ipus, tpu_cores.""" + self._gpus = gpus + self._tpu_cores = tpu_cores + gpus = device_parser.parse_gpu_ids(gpus) + tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores + if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in (0, "0"): + if devices: + # TODO: @awaelchli improve error message + rank_zero_warn( + f"The flag `devices={devices}` will be ignored, " + f"instead the device specific number {deprecated_devices_specific_flag} will be used" + ) - if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( - True - ) > 1: - # TODO: @awaelchli improve error message - rank_zero_warn("more than one device specific flag has been set") - self._device_flag = deprecated_devices_specific_flag - - if 
self._accelerator_flag is None: - # set accelerator type based on num_processes, gpus, ipus, tpu_cores - if ipus: - self._accelerator_flag = "ipu" - if tpu_cores: - self._accelerator_flag = "tpu" - if gpus: - self._accelerator_flag = "gpu" - if num_processes: - self._accelerator_flag = "cpu" + if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( + True + ) > 1: + # TODO: @awaelchli improve error message + rank_zero_warn("more than one device specific flag has been set") + self._device_flag = deprecated_devices_specific_flag + + if self._accelerator_flag is None: + # set accelerator type based on num_processes, gpus, ipus, tpu_cores + if ipus: + self._accelerator_flag = "ipu" + if tpu_cores: + self._accelerator_flag = "tpu" + if gpus: + self._accelerator_flag = "gpu" + if num_processes: + self._accelerator_flag = "cpu" def _special_handle_for_ipu(self) -> None: # current logic only apply to object config @@ -486,16 +483,15 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores def _choose_and_init_cluster_environment(self) -> None: - self.cluster_environment: ClusterEnvironment = LightningEnvironment() if isinstance(self._cluster_environment_flag, ClusterEnvironment): - self.cluster_environment = self._cluster_environment_flag - elif self._is_slurm_managing_tasks(): + return self._cluster_environment_flag + if self._is_slurm_managing_tasks(): rank_zero_info("Multiprocessing is handled by SLURM.") - self.cluster_environment = SLURMEnvironment() - else: - for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): - if env_type.detect(): - self.cluster_environment = env_type() + return SLURMEnvironment() + for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + if env_type.detect(): + return env_type() + return LightningEnvironment() @property def _is_sharded_training_type(self) -> bool: @@ -510,32 +506,31 @@ def _is_slurm_managing_tasks(self): num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) return num_slurm_tasks == total_requested_devices - def _choose_strategy(self) -> None: + def _choose_strategy(self) -> str: if self._accelerator_flag == "ipu": - self._strategy_flag = "ipu_strategy" - elif self._accelerator_flag == "tpu": + return IPUStrategy.strategy_name + if self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: - self._strategy_flag = "tpu_spawn" + return TPUSpawnStrategy.strategy_name else: # TODO: lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" - self._strategy_flag = SingleTPUStrategy(device=self._parallel_devices[0]) - elif _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): - self._strategy_flag = "horovod" - else: - if self._num_nodes_flag > 1: - self._strategy_flag = "ddp" - elif len(self._parallel_devices) <= 1: - device = ( - device_parser.determine_root_gpu_device(self._parallel_devices) - if self._accelerator_flag == "gpu" - else "cpu" - ) - # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" - self._strategy_flag = SingleDeviceStrategy(device=device) - elif len(self._parallel_devices) > 1: - self._strategy_flag = "ddp_spawn" - else: - self._strategy_flag = "ddp" + return SingleTPUStrategy(device=self._parallel_devices[0]) + if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or 
"HOROVOD_RANK" in os.environ): + return HorovodStrategy.strategy_name + if self._num_nodes_flag > 1: + return DDPStrategy.strategy_name + if len(self._parallel_devices) <= 1: + device = ( + device_parser.determine_root_gpu_device(self._parallel_devices) + if self._accelerator_flag == "gpu" + else "cpu" + ) + # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" + return SingleDeviceStrategy(device=device) + if len(self._parallel_devices) > 1: + return DDPSpawnStrategy.strategy_name + + return DDPStrategy.strategy_name def _check_strategy_and_fallback(self) -> None: """Checks edge cases when the strategy selection was a string input, and we need to fall back to a @@ -551,7 +546,7 @@ def _check_strategy_and_fallback(self) -> None: "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) if self._device_flag == 1 and self._num_nodes_flag > 1: - strategy_flag = "ddp" + strategy_flag = DDPStrategy.strategy_name else: strategy_flag = "ddp_spawn" if self._accelerator_flag == "gpu": @@ -651,7 +646,6 @@ def _check_and_init_precision(self) -> PrecisionPlugin: return NativeMixedPrecisionPlugin(self._precision_flag, device) if self._amp_type_flag == AMPType.APEX: - self._amp_level_flag = self._amp_level_flag or "O2" return ApexMixedPrecisionPlugin(self._amp_level_flag) raise RuntimeError("No precision set") @@ -664,21 +658,20 @@ def _validate_precision_choice(self) -> None: raise MisconfigurationException( f"`Trainer(accelerator='ipu', precision={self._precision_flag!r})` is not supported." ) - if isinstance(self.accelerator, TPUAccelerator) and self._precision_flag == 64: - raise MisconfigurationException( - "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" - " requesting this feature." - ) - if ( - isinstance(self.accelerator, TPUAccelerator) - and self._precision_plugin_flag - and not isinstance(self._precision_plugin_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin)) - ): - raise ValueError( - f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," - f" found: {self._precision_plugin_flag}." - ) + if isinstance(self.accelerator, TPUAccelerator): + if self._precision_flag == 64: + raise MisconfigurationException( + "`Trainer(accelerator='tpu', precision=64)` is not implemented." + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " requesting this feature." + ) + if self._precision_plugin_flag and not isinstance( + self._precision_plugin_flag, (TPUPrecisionPlugin, TPUBf16PrecisionPlugin) + ): + raise ValueError( + f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," + f" found: {self._precision_plugin_flag}." 
+ ) if ( self._precision_flag == 16 and isinstance(self.accelerator, CPUAccelerator) @@ -723,7 +716,12 @@ def _lazy_init_strategy(self) -> None: from pytorch_lightning.utilities import _IS_INTERACTIVE # TODO move is_compatible logic to strategy API - interactive_compatible_strategy = ("dp", "ddp_spawn", "ddp_sharded_spawn", "tpu_spawn") + interactive_compatible_strategy = ( + DataParallelStrategy.strategy_name, + DDPSpawnStrategy.strategy_name, + DDPSpawnShardedStrategy.strategy_name, + TPUSpawnStrategy.strategy_name, + ) if _IS_INTERACTIVE and self.strategy.strategy_name not in interactive_compatible_strategy: raise MisconfigurationException( f"`Trainer(strategy={self.strategy.strategy_name!r})` or" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2a4c01061c922..b4fec24b0f101 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1969,7 +1969,7 @@ def should_rank_save_checkpoint(self) -> bool: ) @property - def _strategy_type(self) -> _StrategyType: + def _strategy_type(self) -> Optional[int]: return self.strategy.strategy_name @property From 3560c555c6e2580316a222d5248d03df5aaeb131 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 14 Feb 2022 14:52:35 -0800 Subject: [PATCH 52/69] Apply suggestions from code review Co-authored-by: ananthsub --- pytorch_lightning/callbacks/gpu_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index 68d2ef3ba69eb..f5348b779a803 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -127,7 +127,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O if not trainer.logger: raise MisconfigurationException("Cannot use GPUStatsMonitor callback with Trainer that has no logger.") - if trainer._device_type != _AcceleratorType.GPU.lower(): + if trainer.strategy.root_device.type != "cuda": raise MisconfigurationException( "You are using GPUStatsMonitor but are not running on GPU" f" since gpus attribute in Trainer is set to {trainer.gpus}." 
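The hunk above replaces the `_AcceleratorType`-based check in `GPUStatsMonitor` with a query on `trainer.strategy.root_device`, which is the pattern the rewritten accelerator connector expects callbacks to use for device checks. A minimal sketch of that pattern, assuming a hypothetical callback class (illustrative only, not part of this patch series):

    import pytorch_lightning as pl
    from pytorch_lightning.utilities.exceptions import MisconfigurationException


    class CudaOnlyCallback(pl.Callback):
        """Hypothetical callback that refuses to run unless the root device is CUDA."""

        def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage=None) -> None:
            # After the rewrite the strategy owns the root device, so callbacks
            # query trainer.strategy instead of the removed trainer._device_type checks.
            if trainer.strategy.root_device.type != "cuda":
                raise MisconfigurationException("CudaOnlyCallback requires a CUDA device.")

A run configured as `Trainer(accelerator="gpu", devices=1, callbacks=[CudaOnlyCallback()])` would pass this check, while a CPU run would raise.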
From 55547bcbfb85b70658473b3cf68c0cf65eaad051 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Tue, 15 Feb 2022 18:34:23 -0800 Subject: [PATCH 53/69] fix mypy --- .../callbacks/gpu_stats_monitor.py | 1 - .../connectors/accelerator_connector.py | 81 ++++++++++--------- pytorch_lightning/utilities/device_parser.py | 2 +- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index f5348b779a803..a871bfa309c96 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -29,7 +29,6 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import _AcceleratorType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_only diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 500274fd7a4d8..dbfd885235e42 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -68,6 +68,7 @@ _StrategyType, AMPType, device_parser, + LightningEnum, rank_zero_deprecation, rank_zero_info, rank_zero_warn, @@ -134,7 +135,7 @@ def __init__( priorities which to take when: A. Class > str B. Strategy > Accelerator/precision/plugins - C. When multiple flag set to the same thing? (ignore? not handled for now) + C. TODO When multiple flag set to the same thing """ # TODO: move to gpu accelerator torch.backends.cudnn.benchmark = benchmark @@ -151,7 +152,23 @@ def __init__( # Set each valid flag to `self._x_flag` after validation # Example: If accelerator is set to a strategy type, set `self._strategy_flag = accelerator`. # For devices: Assign gpus, ipus, etc. to the accelerator flag and devices flag - self._check_config_and_set_final_flags(strategy, accelerator, precision, plugins, amp_type, amp_level) + self._strategy_flag: Optional[Union[Strategy, str]] = None + self._accelerator_flag: Optional[Union[Accelerator, str]] = None + self._precision_flag: Optional[Union[int, str]] = None + self._precision_plugin_flag: Optional[PrecisionPlugin] = None + self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None + self.checkpoint_io: Optional[CheckpointIO] = None + self._amp_type_flag: Optional[LightningEnum] = None + self._amp_level_flag: Optional[str] = amp_level + + self._check_config_and_set_final_flags( + strategy=strategy, + accelerator=accelerator, + precision=precision, + plugins=plugins, + amp_type=amp_type, + amp_level=amp_level, + ) self._check_device_config_and_set_final_flags( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) @@ -199,15 +216,6 @@ def _check_config_and_set_final_flags( 4. plugins: a plugin could occur as a value of the strategy argument (handled by 1), or the precision argument (handled by 3). We also extract the CheckpointIO and ClusterEnvironment plugins. 
""" - self._strategy_flag = None - self._accelerator_flag = None - self._precision_flag = None - self._precision_plugin_flag = None - self._cluster_environment_flag = None - self.checkpoint_io = None - self._amp_level_flag = amp_level - self._amp_type_flag = amp_type - if plugins is not None: plugins = [plugins] if not isinstance(plugins, list) else plugins @@ -344,7 +352,6 @@ def _check_config_and_set_final_flags( raise MisconfigurationException( f"You have asked for `amp_level={amp_level!r}` but it's only supported with `amp_backend='apex'`." ) - self._amp_level_flag = amp_level def _check_device_config_and_set_final_flags( self, @@ -355,10 +362,8 @@ def _check_device_config_and_set_final_flags( ipus: Optional[int], tpu_cores: Optional[Union[List[int], int]], ) -> None: - if num_nodes == "auto": - self._num_nodes_flag = 1 - else: - self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + if devices in (0, "0", "0,"): raise MisconfigurationException(f"You passed `devices={devices}`, please set a number > 0") @@ -379,13 +384,13 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( self, devices: Optional[Union[List[int], str, int]], num_processes: Optional[int], - gpus: Optional[List[int]], + gpus: Optional[Union[List[int], str, int]], ipus: Optional[int], - tpu_cores: Optional[Union[int, List[int]]], + tpu_cores: Optional[Union[List[int], str, int]], ) -> None: """Sets the `device_flag` and `accelerator_flag `based on num_processes, gpus, ipus, tpu_cores.""" - self._gpus = gpus - self._tpu_cores = tpu_cores + self._gpus: Optional[Union[List[int], str, int]] = gpus + self._tpu_cores: Optional[Union[List[int], str, int]] = tpu_cores gpus = device_parser.parse_gpu_ids(gpus) tpu_cores = device_parser.parse_tpu_cores(tpu_cores) deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores @@ -434,7 +439,7 @@ def _choose_accelerator(self) -> str: return "cpu" def _set_parallel_devices_and_init_accelerator(self) -> None: - self._parallel_devices = [] + self._parallel_devices: List[Union[int, torch.device]] = [] if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": @@ -444,7 +449,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._device_flag, int): self._parallel_devices = list(range(self._device_flag)) else: - self._parallel_devices = self._device_flag + self._parallel_devices = self._device_flag # type: ignore[assignment] elif self._accelerator_flag == "ipu": self.accelerator = IPUAccelerator() @@ -460,7 +465,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): self._device_flag = int(self._device_flag) self._parallel_devices = ( - [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] + [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] # type: ignore if self._device_flag != 0 else [] ) @@ -482,7 +487,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._gpus = self._device_flag if not self._gpus else self._gpus self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores - def _choose_and_init_cluster_environment(self) -> None: + def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, 
ClusterEnvironment): return self._cluster_environment_flag if self._is_slurm_managing_tasks(): @@ -497,7 +502,7 @@ def _choose_and_init_cluster_environment(self) -> None: def _is_sharded_training_type(self) -> bool: return isinstance(self._strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)) - def _is_slurm_managing_tasks(self): + def _is_slurm_managing_tasks(self) -> bool: """used by choosing cluster enviroment.""" if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": return False @@ -506,7 +511,7 @@ def _is_slurm_managing_tasks(self): num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) return num_slurm_tasks == total_requested_devices - def _choose_strategy(self) -> str: + def _choose_strategy(self) -> Union[Strategy, str]: if self._accelerator_flag == "ipu": return IPUStrategy.strategy_name if self._accelerator_flag == "tpu": @@ -514,7 +519,7 @@ def _choose_strategy(self) -> str: return TPUSpawnStrategy.strategy_name else: # TODO: lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" - return SingleTPUStrategy(device=self._parallel_devices[0]) + return SingleTPUStrategy(device=self._parallel_devices[0]) # type: ignore if _HOROVOD_AVAILABLE and ("OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ): return HorovodStrategy.strategy_name if self._num_nodes_flag > 1: @@ -526,7 +531,7 @@ def _choose_strategy(self) -> str: else "cpu" ) # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" - return SingleDeviceStrategy(device=device) + return SingleDeviceStrategy(device=device) # type: ignore if len(self._parallel_devices) > 1: return DDPSpawnStrategy.strategy_name @@ -553,6 +558,8 @@ def _check_strategy_and_fallback(self) -> None: rank_zero_warn( "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." ) + self._accelerator_flag = "cpu" + self.accelerator = CPUAccelerator() if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() ): @@ -603,19 +610,21 @@ def _check_and_init_precision(self) -> PrecisionPlugin: return self._precision_plugin_flag if isinstance(self.accelerator, IPUAccelerator): - return IPUPrecisionPlugin(self._precision_flag) + return IPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, TPUAccelerator): if self._precision_flag == 32: return TPUPrecisionPlugin() elif self._precision_flag in (16, "bf16"): if self._precision_flag == 16: rank_zero_warn( - f"You passed `Trainer(accelerator='tpu', precision=16)` but {self._amp_type_flag.value} AMP" - f" is not supported with TPUs. Using `precision='bf16'` instead." + "You passed `Trainer(accelerator='tpu', precision=16)` but AMP" + " is not supported with TPUs. Using `precision='bf16'` instead." 
) return TPUBf16PrecisionPlugin() if isinstance(self.strategy, DeepSpeedStrategy): - return DeepSpeedPrecisionPlugin(self._precision_flag, self._amp_type_flag, self._amp_level_flag) + return DeepSpeedPrecisionPlugin( + self._precision_flag, self._amp_type_flag, self._amp_level_flag + ) # type: ignore if self._precision_flag == 32: return PrecisionPlugin() @@ -631,7 +640,7 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if self._precision_flag in (16, "bf16"): rank_zero_info( - f"Using 16bit {self._amp_type_flag.value} Automatic Mixed Precision (AMP)" + f"Using 16bit {self._amp_type_flag.value} Automatic Mixed Precision (AMP)" # type: ignore if self._precision_flag == 16 else "Using bfloat16 Automatic Mixed Precision (AMP)" ) @@ -646,7 +655,7 @@ def _check_and_init_precision(self) -> PrecisionPlugin: return NativeMixedPrecisionPlugin(self._precision_flag, device) if self._amp_type_flag == AMPType.APEX: - return ApexMixedPrecisionPlugin(self._amp_level_flag) + return ApexMixedPrecisionPlugin(self._amp_level_flag) # type: ignore raise RuntimeError("No precision set") @@ -683,7 +692,7 @@ def _validate_precision_choice(self) -> None: ) if self._precision_flag == "bf16" and self._amp_type_flag != AMPType.NATIVE: raise MisconfigurationException( - f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but " + f"You passed `Trainer(amp_type={self._amp_type_flag.value!r}, precision='bf16')` but " # type: ignore "it's not supported. Try using `amp_type='native'` instead." ) if self._precision_flag in (16, "bf16") and self._amp_type_flag == AMPType.APEX: @@ -788,7 +797,7 @@ def devices(self) -> int: @property def tpu_cores(self) -> Optional[Union[List[int], int]]: if isinstance(self.accelerator, TPUAccelerator): - return self._tpu_cores + return self._tpu_cores # type: ignore return 0 @property diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 6fa9ace7f20ec..1e51c5479bdc7 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -21,7 +21,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException -def determine_root_gpu_device(gpus: List[int]) -> Optional[int]: +def determine_root_gpu_device(gpus: List[Union[int, torch.device]]) -> Optional[Union[int, torch.device]]: """ Args: gpus: non-empty list of ints representing which gpus to use From cc684f14fcb118b291dfc6ccf0b1bce70e050545 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 16 Feb 2022 09:54:19 -0800 Subject: [PATCH 54/69] address comments and fix mypy --- .../trainer/connectors/accelerator_connector.py | 10 +++++----- pytorch_lightning/trainer/trainer.py | 6 ++++-- tests/models/test_gpu.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index dbfd885235e42..4a0793abfc068 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -181,7 +181,7 @@ def __init__( self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment - self.cluster_environment = self._choose_and_init_cluster_environment() + self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() # 4. 
Instantiate Strategy - Part 1 if self._strategy_flag is None: @@ -623,8 +623,8 @@ def _check_and_init_precision(self) -> PrecisionPlugin: return TPUBf16PrecisionPlugin() if isinstance(self.strategy, DeepSpeedStrategy): return DeepSpeedPrecisionPlugin( - self._precision_flag, self._amp_type_flag, self._amp_level_flag - ) # type: ignore + self._precision_flag, self._amp_type_flag, self._amp_level_flag # type: ignore + ) if self._precision_flag == 32: return PrecisionPlugin() @@ -824,8 +824,8 @@ def gpus(self) -> Optional[Union[List[int], str, int]]: return self._gpus @property - def parallel_device_ids(self) -> Optional[List[int]]: - return [i for i in range(len(self.parallel_devices))] if isinstance(self.accelerator, GPUAccelerator) else None + def parallel_device_ids(self) -> List[int]: + return [i for i in range(len(self.parallel_devices))] if isinstance(self.accelerator, GPUAccelerator) else [] @property def is_distributed(self) -> bool: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b4fec24b0f101..0430e3d30558c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1969,7 +1969,7 @@ def should_rank_save_checkpoint(self) -> bool: ) @property - def _strategy_type(self) -> Optional[int]: + def _strategy_type(self) -> Optional[str]: return self.strategy.strategy_name @property @@ -2006,7 +2006,9 @@ def devices(self) -> Optional[Union[List[int], str, int]]: @property def data_parallel_device_ids(self) -> Optional[List[int]]: - return self._accelerator_connector.parallel_device_ids + return ( + self._accelerator_connector.parallel_device_ids if self._accelerator_connector.parallel_device_ids else None + ) @property def lightning_module(self) -> "pl.LightningModule": diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index c494c0c1c18e6..d17322e191ff1 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -242,7 +242,7 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus sanitizing the gpus as only one of the GPUs is visible.""" trainer = Trainer(gpus=gpus) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) - assert trainer._accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus) assert trainer.gpus == gpus From ce18f52b60c0d0582ed08fe88f661bbbaf34fcb2 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Feb 2022 08:19:10 +0530 Subject: [PATCH 55/69] Updates to attributes --- .../connectors/accelerator_connector.py | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4a0793abfc068..7bb665a75864c 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -144,9 +144,9 @@ def __init__( # 1. 
Parsing flags # Get registered strategies, built-in accelerators and precision plugins - self._existing_strategies_str = StrategyRegistry.available_strategies() - self._existing_accelerator_type = ("tpu", "ipu", "gpu", "cpu") - self._supported_precision_type = ("16", "32", "64", "bf16", "mixed") + self._registered_strategies = StrategyRegistry.available_strategies() + self._accelerator_types = ("tpu", "ipu", "gpu", "cpu") + self._precision_types = ("16", "32", "64", "bf16", "mixed") # Raise an exception if there are conflicts between flags # Set each valid flag to `self._x_flag` after validation @@ -237,11 +237,7 @@ def _check_config_and_set_final_flags( f"Incompatible values set in `strategy` and `accelerator` arguments." f"Received both strategy={strategy} and accelerator={accelerator}" ) - if ( - isinstance(accelerator, str) - and accelerator in self._existing_strategies_str - and strategy != accelerator - ): + if isinstance(accelerator, str) and accelerator in self._registered_strategies and strategy != accelerator: raise MisconfigurationException( "strategy str already set through strategy flag, but have also passed in through accelerator" ) @@ -252,20 +248,16 @@ def _check_config_and_set_final_flags( f"You have passed `Trainer(strategy)`" f" and you can only specify one strategy, but you have passed {plugin} as a plugin." ) - if isinstance(plugin, str) and plugin in self._existing_strategies_str: + if isinstance(plugin, str) and plugin in self._registered_strategies: raise MisconfigurationException( f"You have passed `Trainer(strategy)`" f" and you can only specify one strategy, but you have passed {plugin} as a plugin." ) if accelerator is not None: - if ( - accelerator in self._existing_accelerator_type - or accelerator == "auto" - or isinstance(accelerator, Accelerator) - ): + if accelerator in self._accelerator_types or accelerator == "auto" or isinstance(accelerator, Accelerator): self._accelerator_flag = accelerator - elif accelerator in self._existing_strategies_str or isinstance(accelerator, Strategy): + elif accelerator in self._registered_strategies or isinstance(accelerator, Strategy): rank_zero_deprecation( f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." @@ -275,16 +267,15 @@ def _check_config_and_set_final_flags( self._strategy_flag = accelerator if precision is not None: - if str(precision) not in self._supported_precision_type: + if str(precision) not in self._precision_types: raise MisconfigurationException( - f"Precision {repr(precision)} is invalid. " - f"Allowed precision values: {self._supported_precision_type}" + f"Precision {repr(precision)} is invalid. 
" f"Allowed precision values: {self._precision_types}" ) self._precision_flag = precision if plugins: for plugin in plugins: - if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._existing_strategies_str: + if isinstance(plugin, Strategy) or isinstance(plugin, str) and plugin in self._registered_strategies: self._strategy_flag = plugin rank_zero_deprecation( f"Passing {plugin} `strategy` to the `plugins` flag in Trainer has been deprecated" @@ -293,7 +284,7 @@ def _check_config_and_set_final_flags( elif isinstance(plugin, PrecisionPlugin): self._precision_plugin_flag = plugin - elif isinstance(plugin, str) and plugin in self._supported_precision_type: + elif isinstance(plugin, str) and plugin in self._precision_types: self._precision_flag = plugin elif isinstance(plugin, CheckpointIO): self.checkpoint_io = plugin From 88db8306346b89d62f7a839715ad8b783e40d6de Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Feb 2022 08:43:58 +0530 Subject: [PATCH 56/69] Improve exceptions --- .../trainer/connectors/accelerator_connector.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7bb665a75864c..f6ccb903a57b1 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -209,7 +209,7 @@ def _check_config_and_set_final_flags( 1. strategy: strategy, accelerator and plugin can all be set to strategies 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), - set self.accelerator_flag accordingly. If the value is strategy related (instance or string), + set self._accelerator_flag accordingly. If the value is strategy related (instance or string), it gets handled by 1. 3. precision: The final value of the precision flag may be determined either by the precision argument or by a plugin instance. @@ -239,18 +239,19 @@ def _check_config_and_set_final_flags( ) if isinstance(accelerator, str) and accelerator in self._registered_strategies and strategy != accelerator: raise MisconfigurationException( - "strategy str already set through strategy flag, but have also passed in through accelerator" + f"strategy {strategy} already set through `strategy` flag," + f" but have also passed {accelerator} in through the accelerator flag." ) if plugins: for plugin in plugins: if isinstance(plugin, Strategy): raise MisconfigurationException( - f"You have passed `Trainer(strategy)`" + f"You have passed `Trainer(strategy={strategy})`" f" and you can only specify one strategy, but you have passed {plugin} as a plugin." ) if isinstance(plugin, str) and plugin in self._registered_strategies: raise MisconfigurationException( - f"You have passed `Trainer(strategy)`" + f"You have passed `Trainer(strategy={strategy})`" f" and you can only specify one strategy, but you have passed {plugin} as a plugin." ) @@ -269,7 +270,7 @@ def _check_config_and_set_final_flags( if precision is not None: if str(precision) not in self._precision_types: raise MisconfigurationException( - f"Precision {repr(precision)} is invalid. " f"Allowed precision values: {self._precision_types}" + f"Precision {repr(precision)} is invalid. 
Allowed precision values: {self._precision_types}" ) self._precision_flag = precision @@ -292,7 +293,7 @@ def _check_config_and_set_final_flags( self._cluster_environment_flag = plugin else: raise MisconfigurationException( - f"Found invalid type for plugin {plugin}. Expected a precision or training type plugin." + f"Found invalid type for plugin {plugin}. Expected a precision plugin or training strategy." ) # handle the case when the user passes in a strategy instance which has an accelerator, precision, From ee70db83c3b7229e7880ec49a47dc3449aece1d8 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Feb 2022 08:54:35 +0530 Subject: [PATCH 57/69] Updates to attributes --- .../connectors/accelerator_connector.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index f6ccb903a57b1..8cb1f03ccdcc3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -298,7 +298,7 @@ def _check_config_and_set_final_flags( # handle the case when the user passes in a strategy instance which has an accelerator, precision, # checkpoint io or cluster env set up - # TODO: @awaelchli imporve the error messages below + # TODO: @awaelchli improve the error messages below if self._strategy_flag and isinstance(self._strategy_flag, Strategy): if self._strategy_flag._accelerator: if self._accelerator_flag: @@ -359,14 +359,14 @@ def _check_device_config_and_set_final_flags( if devices in (0, "0", "0,"): raise MisconfigurationException(f"You passed `devices={devices}`, please set a number > 0") - self._device_flag = devices + self._devices_flag = devices # TODO: Delete this method num_processes, gpus, ipus and tpu_cores get removed self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, num_processes, gpus, ipus, tpu_cores ) - if self._device_flag == "auto" and self._accelerator_flag is None: + if self._devices_flag == "auto" and self._accelerator_flag is None: raise MisconfigurationException( f"You passed `devices={devices}` but haven't specified" " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" @@ -399,7 +399,7 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( ) > 1: # TODO: @awaelchli improve error message rank_zero_warn("more than one device specific flag has been set") - self._device_flag = deprecated_devices_specific_flag + self._devices_flag = deprecated_devices_specific_flag if self._accelerator_flag is None: # set accelerator type based on num_processes, gpus, ipus, tpu_cores @@ -436,48 +436,48 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self.accelerator: Accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": self.accelerator = TPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = TPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = list(range(self._device_flag)) + if self._devices_flag == "auto" or not self._devices_flag: + self._devices_flag = TPUAccelerator.auto_device_count() + if isinstance(self._devices_flag, int): + self._parallel_devices = list(range(self._devices_flag)) else: - self._parallel_devices = self._device_flag # type: ignore[assignment] + self._parallel_devices = self._devices_flag # type: ignore[assignment] elif 
self._accelerator_flag == "ipu": self.accelerator = IPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = IPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = list(range(self._device_flag)) + if self._devices_flag == "auto" or not self._devices_flag: + self._devices_flag = IPUAccelerator.auto_device_count() + if isinstance(self._devices_flag, int): + self._parallel_devices = list(range(self._devices_flag)) elif self._accelerator_flag == "gpu": self.accelerator = GPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = GPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int) or isinstance(self._device_flag, str): - self._device_flag = int(self._device_flag) + if self._devices_flag == "auto" or not self._devices_flag: + self._devices_flag = GPUAccelerator.auto_device_count() + if isinstance(self._devices_flag, int) or isinstance(self._devices_flag, str): + self._devices_flag = int(self._devices_flag) self._parallel_devices = ( - [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._device_flag)] # type: ignore - if self._device_flag != 0 + [torch.device("cuda", i) for i in device_parser.parse_gpu_ids(self._devices_flag)] # type: ignore + if self._devices_flag != 0 else [] ) else: - self._parallel_devices = [torch.device("cuda", i) for i in self._device_flag] + self._parallel_devices = [torch.device("cuda", i) for i in self._devices_flag] elif self._accelerator_flag == "cpu": self.accelerator = CPUAccelerator() - if self._device_flag == "auto" or not self._device_flag: - self._device_flag = CPUAccelerator.auto_device_count() - if isinstance(self._device_flag, int): - self._parallel_devices = [torch.device("cpu")] * self._device_flag + if self._devices_flag == "auto" or not self._devices_flag: + self._devices_flag = CPUAccelerator.auto_device_count() + if isinstance(self._devices_flag, int): + self._parallel_devices = [torch.device("cpu")] * self._devices_flag else: rank_zero_warn( "The flag `devices` must be an int with `accelerator='cpu'`," - f" got `devices={self._device_flag}` instead." + f" got `devices={self._devices_flag}` instead." ) - self._gpus = self._device_flag if not self._gpus else self._gpus - self._tpu_cores = self._device_flag if not self._tpu_cores else self._tpu_cores + self._gpus = self._devices_flag if not self._gpus else self._gpus + self._tpu_cores = self._devices_flag if not self._tpu_cores else self._tpu_cores def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, ClusterEnvironment): @@ -542,7 +542,7 @@ def _check_strategy_and_fallback(self) -> None: "`accelerator='ddp_cpu'` is not supported on TPU machines. 
" "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) - if self._device_flag == 1 and self._num_nodes_flag > 1: + if self._devices_flag == 1 and self._num_nodes_flag > 1: strategy_flag = DDPStrategy.strategy_name else: strategy_flag = "ddp_spawn" From c516830008cacc131b0df6bbd38f86fa6b44fbe4 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Feb 2022 09:30:05 +0530 Subject: [PATCH 58/69] Add utility methods --- .../connectors/accelerator_connector.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 8cb1f03ccdcc3..1224bbd07a87d 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -175,7 +175,7 @@ def __init__( # 2. Instantiate Accelerator # handle `auto` and `None` - self._special_handle_for_ipu() + self._set_accelerator_if_ipu_strategy_is_passed() if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_accelerator() self._set_parallel_devices_and_init_accelerator() @@ -380,7 +380,7 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( ipus: Optional[int], tpu_cores: Optional[Union[List[int], str, int]], ) -> None: - """Sets the `device_flag` and `accelerator_flag `based on num_processes, gpus, ipus, tpu_cores.""" + """Sets the `devices_flag` and `accelerator_flag `based on num_processes, gpus, ipus, tpu_cores.""" self._gpus: Optional[Union[List[int], str, int]] = gpus self._tpu_cores: Optional[Union[List[int], str, int]] = tpu_cores gpus = device_parser.parse_gpu_ids(gpus) @@ -412,7 +412,7 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( if num_processes: self._accelerator_flag = "cpu" - def _special_handle_for_ipu(self) -> None: + def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: # current logic only apply to object config # TODO this logic should apply to both str and object config if isinstance(self._strategy_flag, IPUStrategy): @@ -427,17 +427,16 @@ def _choose_accelerator(self) -> str: return "ipu" if torch.cuda.is_available() and torch.cuda.device_count() > 0: return "gpu" - # [RFC] this is current logic, if accelerator=None, default cpu? 
return "cpu" def _set_parallel_devices_and_init_accelerator(self) -> None: self._parallel_devices: List[Union[int, torch.device]] = [] + if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag elif self._accelerator_flag == "tpu": self.accelerator = TPUAccelerator() - if self._devices_flag == "auto" or not self._devices_flag: - self._devices_flag = TPUAccelerator.auto_device_count() + self._set_devices_flag_if_auto_passed() if isinstance(self._devices_flag, int): self._parallel_devices = list(range(self._devices_flag)) else: @@ -445,15 +444,13 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: elif self._accelerator_flag == "ipu": self.accelerator = IPUAccelerator() - if self._devices_flag == "auto" or not self._devices_flag: - self._devices_flag = IPUAccelerator.auto_device_count() + self._set_devices_flag_if_auto_passed() if isinstance(self._devices_flag, int): self._parallel_devices = list(range(self._devices_flag)) elif self._accelerator_flag == "gpu": self.accelerator = GPUAccelerator() - if self._devices_flag == "auto" or not self._devices_flag: - self._devices_flag = GPUAccelerator.auto_device_count() + self._set_devices_flag_if_auto_passed() if isinstance(self._devices_flag, int) or isinstance(self._devices_flag, str): self._devices_flag = int(self._devices_flag) self._parallel_devices = ( @@ -466,8 +463,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: elif self._accelerator_flag == "cpu": self.accelerator = CPUAccelerator() - if self._devices_flag == "auto" or not self._devices_flag: - self._devices_flag = CPUAccelerator.auto_device_count() + self._set_devices_flag_if_auto_passed() if isinstance(self._devices_flag, int): self._parallel_devices = [torch.device("cpu")] * self._devices_flag else: @@ -479,6 +475,10 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._gpus = self._devices_flag if not self._gpus else self._gpus self._tpu_cores = self._devices_flag if not self._tpu_cores else self._tpu_cores + def _set_devices_flag_if_auto_passed(self) -> None: + if self._devices_flag == "auto" or not self._devices_flag: + self._devices_flag = self.accelerator.auto_device_count() + def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, ClusterEnvironment): return self._cluster_environment_flag @@ -490,10 +490,6 @@ def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: return env_type() return LightningEnvironment() - @property - def _is_sharded_training_type(self) -> bool: - return isinstance(self._strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)) - def _is_slurm_managing_tasks(self) -> bool: """used by choosing cluster enviroment.""" if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": From 8b0721825415efcaf5f342fc062b476628e925f2 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Feb 2022 10:57:01 +0530 Subject: [PATCH 59/69] Handle zero/empty list values for devices flag --- .../trainer/connectors/accelerator_connector.py | 12 ++++++------ tests/accelerators/test_accelerator_connector.py | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 1224bbd07a87d..413009a25c373 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -355,16 +355,16 
@@ def _check_device_config_and_set_final_flags( tpu_cores: Optional[Union[List[int], int]], ) -> None: self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 - - if devices in (0, "0", "0,"): - raise MisconfigurationException(f"You passed `devices={devices}`, please set a number > 0") - self._devices_flag = devices - # TODO: Delete this method num_processes, gpus, ipus and tpu_cores get removed + # TODO: Delete this method when num_processes, gpus, ipus and tpu_cores gets removed self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, num_processes, gpus, ipus, tpu_cores ) + + if self._devices_flag in ([], 0, "0", "0,"): + rank_zero_warn(f"You passed `devices={devices}`, switching to `cpu` accelerator") + self._accelerator_flag = "cpu" if self._devices_flag == "auto" and self._accelerator_flag is None: raise MisconfigurationException( @@ -380,7 +380,7 @@ def _map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( ipus: Optional[int], tpu_cores: Optional[Union[List[int], str, int]], ) -> None: - """Sets the `devices_flag` and `accelerator_flag `based on num_processes, gpus, ipus, tpu_cores.""" + """Sets the `devices_flag` and `accelerator_flag` based on num_processes, gpus, ipus, tpu_cores.""" self._gpus: Optional[Union[List[int], str, int]] = gpus self._tpu_cores: Optional[Union[List[int], str, int]] = tpu_cores gpus = device_parser.parse_gpu_ids(gpus) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 2df4a8e1b63da..946da00e4d1f0 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -942,3 +942,11 @@ def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.devices == 2 assert trainer.gpus == 2 + + +def test_passing_zero_and_empty_list_to_devices_flag(): + with pytest.warns(UserWarning, match=r"switching to `cpu` accelerator"): + Trainer(accelerator="gpu", devices=0) + + with pytest.warns(UserWarning, match=r"switching to `cpu` accelerator"): + Trainer(accelerator="gpu", devices=[]) From caaf390baec507a51aeae3762df7bac80586d6b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Feb 2022 05:28:23 +0000 Subject: [PATCH 60/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- tests/accelerators/test_accelerator_connector.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 413009a25c373..7e3aa4ef32af7 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -361,7 +361,7 @@ def _check_device_config_and_set_final_flags( self._map_deprecated_devices_specfic_info_to_accelerator_and_device_flag( devices, num_processes, gpus, ipus, tpu_cores ) - + if self._devices_flag in ([], 0, "0", "0,"): rank_zero_warn(f"You passed `devices={devices}`, switching to `cpu` accelerator") self._accelerator_flag = "cpu" diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 946da00e4d1f0..7401fa477d982 100644 --- 
a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -942,11 +942,11 @@ def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.devices == 2 assert trainer.gpus == 2 - + def test_passing_zero_and_empty_list_to_devices_flag(): with pytest.warns(UserWarning, match=r"switching to `cpu` accelerator"): Trainer(accelerator="gpu", devices=0) - + with pytest.warns(UserWarning, match=r"switching to `cpu` accelerator"): Trainer(accelerator="gpu", devices=[]) From cd12345cd9d75b93e87d2427f1e84bcb44d8d5b5 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Thu, 17 Feb 2022 09:22:55 -0800 Subject: [PATCH 61/69] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/strategies/parallel.py | 8 -------- pytorch_lightning/utilities/device_parser.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/pytorch_lightning/strategies/parallel.py b/pytorch_lightning/strategies/parallel.py index d8a8ab50abe2d..11207065b7e21 100644 --- a/pytorch_lightning/strategies/parallel.py +++ b/pytorch_lightning/strategies/parallel.py @@ -85,14 +85,6 @@ def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=len(self.parallel_devices), rank=self.global_rank) return distributed_sampler_kwargs - @property - def parallel_devices(self): - return self._parallel_devices - - @parallel_devices.setter - def parallel_devices(self, parallel_devices): - self._parallel_devices = parallel_devices - def reconciliate_processes(self, trace: str): """Function to re-conciliate processes on failure.""" diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 1e51c5479bdc7..17e2f70aa626a 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -21,7 +21,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException -def determine_root_gpu_device(gpus: List[Union[int, torch.device]]) -> Optional[Union[int, torch.device]]: +def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: """ Args: gpus: non-empty list of ints representing which gpus to use From a442852753961232915104f19f7d3271a5d24c34 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 17 Feb 2022 10:47:24 -0800 Subject: [PATCH 62/69] address comments --- pytorch_lightning/accelerators/gpu.py | 3 ++- pytorch_lightning/strategies/tpu_spawn.py | 2 +- .../trainer/connectors/accelerator_connector.py | 12 ++++++++++-- pytorch_lightning/utilities/device_parser.py | 3 ++- tests/accelerators/test_accelerator_connector.py | 12 ++++-------- tests/utilities/test_cli.py | 4 ++-- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index aa8b0d56dbf63..6fa9fa94594af 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -82,7 +82,8 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: - return torch.cuda.device_count() > 0 + print(torch.cuda.is_available() and torch.cuda.device_count() > 0) + return torch.cuda.is_available() and torch.cuda.device_count() > 0 def get_nvidia_gpu_stats(device: _DEVICE) -> dict[str, float]: diff --git 
a/pytorch_lightning/strategies/tpu_spawn.py b/pytorch_lightning/strategies/tpu_spawn.py index b43267b5c91d6..867624fd2151e 100644 --- a/pytorch_lightning/strategies/tpu_spawn.py +++ b/pytorch_lightning/strategies/tpu_spawn.py @@ -52,7 +52,7 @@ class TPUSpawnStrategy(DDPSpawnStrategy): """Strategy for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method.""" - strategy_name = "tpu_spawn_strategy" + strategy_name = "tpu_spawn" def __init__( self, diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7e3aa4ef32af7..a3c626cecfe1c 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -451,6 +451,8 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: elif self._accelerator_flag == "gpu": self.accelerator = GPUAccelerator() self._set_devices_flag_if_auto_passed() + # TODO add device availablity check for all devices, not only GPU + self._check_device_availability() if isinstance(self._devices_flag, int) or isinstance(self._devices_flag, str): self._devices_flag = int(self._devices_flag) self._parallel_devices = ( @@ -459,7 +461,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: else [] ) else: - self._parallel_devices = [torch.device("cuda", i) for i in self._devices_flag] + self._parallel_devices = [torch.device("cuda", i) for i in self._devices_flag] # type: ignore elif self._accelerator_flag == "cpu": self.accelerator = CPUAccelerator() @@ -479,6 +481,12 @@ def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or not self._devices_flag: self._devices_flag = self.accelerator.auto_device_count() + def _check_device_availability(self) -> None: + if not self.accelerator.is_available(): + raise MisconfigurationException( + f"You requested {self._accelerator_flag}, " f"but {self._accelerator_flag} is not available" + ) + def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, ClusterEnvironment): return self._cluster_environment_flag @@ -514,7 +522,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: return DDPStrategy.strategy_name if len(self._parallel_devices) <= 1: device = ( - device_parser.determine_root_gpu_device(self._parallel_devices) + device_parser.determine_root_gpu_device(self._parallel_devices) # type: ignore if self._accelerator_flag == "gpu" else "cpu" ) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 17e2f70aa626a..d7b8a319ea4d2 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -19,6 +19,7 @@ from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.types import _DEVICE def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: @@ -164,7 +165,7 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: for gpu in gpus: if gpu not in all_available_gpus: raise MisconfigurationException( - f"You requested GPUs: {gpus}\n But your machine only has: {all_available_gpus}" + f"You requested gpu: {gpus}\n But your machine only has: {all_available_gpus}" ) return gpus diff --git a/tests/accelerators/test_accelerator_connector.py 
b/tests/accelerators/test_accelerator_connector.py
index 7401fa477d982..69a40c2adc997 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -453,12 +453,11 @@ def test_accelerator_cpu(mack_gpu_avalible):
     assert trainer._device_type == "cpu"
     assert isinstance(trainer.accelerator, CPUAccelerator)
 
-    with pytest.raises(MisconfigurationException):
+    with pytest.raises(MisconfigurationException, match="You requested gpu"):
         trainer = Trainer(gpus=1)
-    # with pytest.raises(MisconfigurationException):
-    #     trainer = Trainer(accelerator="gpu")
-
-    with pytest.raises(MisconfigurationException, match="You requested GPUs:"):
+    with pytest.raises(MisconfigurationException, match="You requested gpu, but gpu is not available"):
+        trainer = Trainer(accelerator="gpu")
+    with pytest.raises(MisconfigurationException, match="You requested gpu:"):
         trainer = Trainer(accelerator="cpu", gpus=1)
 
 
@@ -470,9 +469,6 @@ def test_accelerator_gpu():
     assert trainer._device_type == "gpu"
     assert isinstance(trainer.accelerator, GPUAccelerator)
 
-    # with pytest.raises(
-    #     MisconfigurationException, match="You passed `accelerator='gpu'`, but you didn't pass `gpus` to `Trainer`"
-    # ):
     trainer = Trainer(accelerator="gpu")
 
     trainer = Trainer(accelerator="auto", gpus=1)
diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py
index 8992f0c1accd9..2803c0c4601c1 100644
--- a/tests/utilities/test_cli.py
+++ b/tests/utilities/test_cli.py
@@ -582,8 +582,8 @@ def on_fit_start(self):
     (
         # dict(strategy="ddp_spawn")
         # dict(strategy="ddp")
-        # !! old accl_conn will choose singleDeviceStrategy for both strategy=ddp/ddp_spawn
-        # this test never worked with DDPSpawnStrategy
+        # the previous accl_conn will choose singleDeviceStrategy for both strategy=ddp/ddp_spawn
+        # TODO revisit this test as it never worked with DDP or DDPSpawn
         dict(strategy="single_device"),
         pytest.param({"tpu_cores": 1}, marks=RunIf(tpu=True)),
     ),

From 3152f81728bb29e77fab03abdc1dec6d50514fbb Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Thu, 17 Feb 2022 10:50:25 -0800
Subject: [PATCH 63/69] minor comments change

---
 tests/accelerators/test_accelerator_connector.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 69a40c2adc997..78fb7b0c30b48 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -550,7 +550,6 @@ def test_accelerator_gpu_with_gpus_priority():
 
 
 def test_validate_accelerator_and_devices():
-    # with pytest.raises(MisconfigurationException, match="You passed `devices=2` but haven't specified"):
     trainer = Trainer(accelerator="ddp_cpu", devices=2)
     assert isinstance(trainer.accelerator, CPUAccelerator)
     assert trainer.num_processes == 2

From 2c2e5ace657318607eb52f5d29d7e241dfde1d9a Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Thu, 17 Feb 2022 11:24:11 -0800
Subject: [PATCH 64/69] fix tests

---
 .../trainer/connectors/accelerator_connector.py  | 12 +++---------
 tests/accelerators/test_accelerator_connector.py |  5 +++--
 tests/strategies/test_deepspeed_strategy.py      |  7 ++++++-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index a3c626cecfe1c..e68806a024994 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -430,6 +430,7 @@ def _choose_accelerator(self) -> str:
         return "cpu"
 
     def _set_parallel_devices_and_init_accelerator(self) -> None:
+        # TODO add device availability check
         self._parallel_devices: List[Union[int, torch.device]] = []
 
         if isinstance(self._accelerator_flag, Accelerator):
@@ -451,8 +452,6 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
         elif self._accelerator_flag == "gpu":
             self.accelerator = GPUAccelerator()
             self._set_devices_flag_if_auto_passed()
-            # TODO add device availablity check for all devices, not only GPU
-            self._check_device_availability()
             if isinstance(self._devices_flag, int) or isinstance(self._devices_flag, str):
                 self._devices_flag = int(self._devices_flag)
                 self._parallel_devices = (
@@ -481,12 +480,6 @@ def _set_devices_flag_if_auto_passed(self) -> None:
         if self._devices_flag == "auto" or not self._devices_flag:
             self._devices_flag = self.accelerator.auto_device_count()
 
-    def _check_device_availability(self) -> None:
-        if not self.accelerator.is_available():
-            raise MisconfigurationException(
-                f"You requested {self._accelerator_flag}, " f"but {self._accelerator_flag} is not available"
-            )
-
     def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
         if isinstance(self._cluster_environment_flag, ClusterEnvironment):
             return self._cluster_environment_flag
@@ -651,7 +644,8 @@ def _check_and_init_precision(self) -> PrecisionPlugin:
             return NativeMixedPrecisionPlugin(self._precision_flag, device)
 
         if self._amp_type_flag == AMPType.APEX:
-            return ApexMixedPrecisionPlugin(self._amp_level_flag)  # type: ignore
+            self._amp_level_flag = self._amp_level_flag or "O2"
+            return ApexMixedPrecisionPlugin(self._amp_level_flag)
 
         raise RuntimeError("No precision set")
 
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 78fb7b0c30b48..526d94fa3c829 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -455,8 +455,9 @@ def test_accelerator_cpu(mack_gpu_avalible):
     with pytest.raises(MisconfigurationException, match="You requested gpu"):
         trainer = Trainer(gpus=1)
-    with pytest.raises(MisconfigurationException, match="You requested gpu, but gpu is not available"):
-        trainer = Trainer(accelerator="gpu")
+    # TODO enable this test when add device availability check
+    # with pytest.raises(MisconfigurationException, match="You requested gpu, but gpu is not available"):
+    #     trainer = Trainer(accelerator="gpu")
     with pytest.raises(MisconfigurationException, match="You requested gpu:"):
         trainer = Trainer(accelerator="cpu", gpus=1)
 
diff --git a/tests/strategies/test_deepspeed_strategy.py b/tests/strategies/test_deepspeed_strategy.py
index 5eed2578546ba..e5306b0942131 100644
--- a/tests/strategies/test_deepspeed_strategy.py
+++ b/tests/strategies/test_deepspeed_strategy.py
@@ -167,7 +167,12 @@ def test_deepspeed_precision_choice(amp_backend, precision, tmpdir):
     """
     trainer = Trainer(
-        fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed", amp_backend=amp_backend, precision=precision
+        fast_dev_run=True,
+        default_root_dir=tmpdir,
+        accelerator="gpu",
+        strategy="deepspeed",
+        amp_backend=amp_backend,
+        precision=precision,
     )
 
     assert isinstance(trainer.strategy, DeepSpeedStrategy)

From 5f32feb07eb62dc881def9bc8776c7382370dd1b Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Thu, 17 Feb 2022 13:44:35 -0800
Subject: [PATCH 65/69] minor fix

---
 pytorch_lightning/accelerators/gpu.py | 3 +--
 tests/trainer/test_trainer_cli.py     | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 6fa9fa94594af..aa8b0d56dbf63 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -82,8 +82,7 @@ def auto_device_count() -> int:
 
     @staticmethod
     def is_available() -> bool:
-        print(torch.cuda.is_available() and torch.cuda.device_count() > 0)
-        return torch.cuda.is_available() and torch.cuda.device_count() > 0
+        return torch.cuda.device_count() > 0
 
 
 def get_nvidia_gpu_stats(device: _DEVICE) -> dict[str, float]:
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index 330b0f75ffb61..b5713893f769b 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -163,7 +163,7 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected):
 
 @pytest.mark.parametrize(
     ["cli_args", "expected_parsed", "expected_device_ids"],
-    [("", None, None), ("--accelerator gpu --devices 1", "1", [0])],
+    [("", None, None), ("--accelerator gpu --devices 1", "1", [0]), ("--accelerator gpu --devices 0,", "0,", None)],
 )
 @RunIf(min_gpus=1)
 def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_device_ids):

From f2ab1d6df2a9ae3686fa8b1996b18f0af5c6c23f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 17 Feb 2022 21:46:18 +0000
Subject: [PATCH 66/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/strategies/ddp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py
index e13f8a3536bc8..b5d83478101f1 100644
--- a/pytorch_lightning/strategies/ddp.py
+++ b/pytorch_lightning/strategies/ddp.py
@@ -96,7 +96,6 @@ def __init__(
         self._sync_dir: Optional[str] = None
         self._rank_0_will_call_children_scripts: bool = False
 
-
     @property
     def is_distributed(self) -> bool:
         return True

From a6ff2c34c03222dd79f08af087785924ae068af2 Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Thu, 17 Feb 2022 14:02:21 -0800
Subject: [PATCH 67/69] add _configure_launcher call to accl_conn

---
 pytorch_lightning/trainer/connectors/accelerator_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index e68806a024994..20c5f485b4e71 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -711,6 +711,7 @@ def _lazy_init_strategy(self) -> None:
             self.strategy.sync_batchnorm = self.sync_batchnorm
         if hasattr(self.strategy, "set_world_ranks"):
             self.strategy.set_world_ranks()
+        self.strategy._configure_launcher()
 
         from pytorch_lightning.utilities import _IS_INTERACTIVE

From 869e5710163803a729581ec1ce2d3dc144db1fbd Mon Sep 17 00:00:00 2001
From: four4fish <88516121+four4fish@users.noreply.github.com>
Date: Thu, 17 Feb 2022 14:49:39 -0800
Subject: [PATCH 68/69] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 tests/accelerators/test_accelerator_connector.py | 7 ++++---
 tests/accelerators/test_ipu.py                   | 1 +
 tests/accelerators/test_tpu.py                   | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 526d94fa3c829..aabc21b10d20b 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -446,7 +446,7 @@ def test_accelerator_choice_multi_node_gpu(
 
 
 @mock.patch("torch.cuda.is_available", return_value=False)
-def test_accelerator_cpu(mack_gpu_avalible):
+def test_accelerator_cpu(_):
 
     trainer = Trainer(accelerator="cpu")
 
@@ -471,6 +471,7 @@ def test_accelerator_gpu():
     assert isinstance(trainer.accelerator, GPUAccelerator)
 
     trainer = Trainer(accelerator="gpu")
+    assert isinstance(trainer.accelerator, GPUAccelerator)
 
     trainer = Trainer(accelerator="auto", gpus=1)
 
@@ -573,8 +574,8 @@ def test_devices_with_cpu_only_supports_integer():
 
     with pytest.warns(UserWarning, match="The flag `devices` must be an int"):
         trainer = Trainer(accelerator="cpu", devices="1,3")
-        assert isinstance(trainer.accelerator, CPUAccelerator)
-        assert trainer.devices == 1
+    assert isinstance(trainer.accelerator, CPUAccelerator)
+    assert trainer.devices == 1
 
 
 @pytest.mark.parametrize("training_type", ["ddp2", "dp"])
diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py
index 40ceab7195219..b8f01815704f5 100644
--- a/tests/accelerators/test_ipu.py
+++ b/tests/accelerators/test_ipu.py
@@ -506,6 +506,7 @@ def test_accelerator_ipu():
     assert isinstance(trainer.accelerator, IPUAccelerator)
 
     trainer = Trainer(accelerator="ipu")
+    assert isinstance(trainer.accelerator, IPUAccelerator)
 
     trainer = Trainer(accelerator="auto", ipus=8)
 
diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py
index dc004f957dac1..d8f99ec4dcedb 100644
--- a/tests/accelerators/test_tpu.py
+++ b/tests/accelerators/test_tpu.py
@@ -91,6 +91,7 @@ def test_accelerator_tpu():
     assert isinstance(trainer.accelerator, TPUAccelerator)
 
     trainer = Trainer(accelerator="tpu")
+    assert isinstance(trainer.accelerator, TPUAccelerator)
 
 
 @RunIf(tpu=True)

From 9568f3b728b10bfd1b737f0445acf664653a159d Mon Sep 17 00:00:00 2001
From: four4fish <88516121+four4fish@users.noreply.github.com>
Date: Thu, 17 Feb 2022 14:59:32 -0800
Subject: [PATCH 69/69] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 pytorch_lightning/trainer/trainer.py             | 2 +-
 tests/accelerators/test_accelerator_connector.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 4360015ba537b..6ed5d6c31f719 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1964,7 +1964,7 @@ def should_rank_save_checkpoint(self) -> bool:
         )
 
     @property
-    def _strategy_type(self) -> Optional[str]:
+    def _strategy_type(self) -> str:
         return self.strategy.strategy_name
 
     @property
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index aabc21b10d20b..76fa6d64f5a56 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -453,7 +453,7 @@ def test_accelerator_cpu(_):
     assert trainer._device_type == "cpu"
     assert isinstance(trainer.accelerator, CPUAccelerator)
 
-    with pytest.raises(MisconfigurationException, match="You requested gpu"):
+    with pytest.raises(MisconfigurationException, match="You requested gpu:"):
         trainer = Trainer(gpus=1)
     # TODO enable this test when add device availability check
     # with pytest.raises(MisconfigurationException, match="You requested gpu, but gpu is not available"):