From 9d0a88f6fd531bb1fad823f0fb8d83b54439c45c Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Fri, 3 Dec 2021 17:12:34 -0800
Subject: [PATCH 01/42] remove training_step() from accelerator

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f95ebaebac6c..aa3b7c00105d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -258,6 +258,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Removed methods `pre_dispatch`, `dispatch` and `post_dispatch` from the `Accelerator` ([#10885](https://github.com/PyTorchLightning/pytorch-lightning/pull/10885))

+- Removed method `training_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890))
+
+
+### Fixed
+
+
 - Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890))

From ac313dcde488192db20cbaf37cf2b77b1cf7c9f1 Mon Sep 17 00:00:00 2001
From: Siyu Wang
Date: Fri, 3 Dec 2021 18:54:37 -0800
Subject: [PATCH 02/42] remove test, val, predict step

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa3b7c00105d5..319159d871f72 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -255,10 +255,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Removed method `setup_optimizers_in_pre_dispatch` from the `strategies` and achieve the same logic in `setup` and `pre_dispatch` methods ([#10906](https://github.com/PyTorchLightning/pytorch-lightning/pull/10906))

+<<<<<<< HEAD
 - Removed methods `pre_dispatch`, `dispatch` and `post_dispatch` from the `Accelerator` ([#10885](https://github.com/PyTorchLightning/pytorch-lightning/pull/10885))

 - Removed method `training_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890))
+=======
+- Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890))
+>>>>>>> c593c9f0f (remove test, val, predict step)

 ### Fixed

 - Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890))

From c22ce58794e8f2313497713565f9223344c0a6c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 6 Dec 2021 21:36:34 +0100
Subject: [PATCH 03/42] move

---
 pytorch_lightning/accelerators/accelerator.py | 58 ++-----------------
 pytorch_lightning/accelerators/cpu.py | 4 +-
 pytorch_lightning/accelerators/gpu.py | 10 +---
 pytorch_lightning/lite/lite.py | 2 +-
 .../plugins/training_type/ddp.py | 1 +
 .../training_type/training_type_plugin.py | 13 ++++-
 tests/accelerators/test_ipu.py | 6 +-
 tests/accelerators/test_tpu.py | 8 +--
 ..._ddp_fully_sharded_with_full_state_dict.py | 2 +-
 tests/plugins/test_ddp_plugin.py | 12 ++--
 10 files changed, 37 insertions(+), 79 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 18fd855c94a60..9557781d084f5 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -31,27 +31,14 @@ class Accelerator:
     - GPU
     - TPU
     - IPU
-
-    Each Accelerator gets two plugins upon initialization:
-    One to handle differences from the training routine and one to handle different precisions.
""" - def __init__(self, precision_plugin: Optional[PrecisionPlugin], training_type_plugin: TrainingTypePlugin) -> None: - """ - Args: - precision_plugin: the plugin to handle precision-specific parts - - .. deprecated:: - The ``precision_plugin`` parameter has been deprecated and will be removed soon. - Pass the precision plugin as a parameter to the ``TrainingTypePlugin`` instead. - - training_type_plugin: the plugin to handle different training routines - """ + def __init__(self) -> None: + super().__init__() - self.training_type_plugin = training_type_plugin - - if precision_plugin is not None: - self.training_type_plugin._precision_plugin = precision_plugin + @property + def root_device(self): + return None def setup_environment(self) -> None: """Setup any processes or distributed connections. @@ -59,7 +46,6 @@ def setup_environment(self) -> None: This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator environment before setup is complete. """ - self.training_type_plugin.setup_environment() def setup(self, trainer: "pl.Trainer") -> None: """Setup plugins for the trainer fit and creates optimizers. @@ -67,40 +53,6 @@ def setup(self, trainer: "pl.Trainer") -> None: Args: trainer: the trainer instance """ - self.training_type_plugin.setup(trainer) - - @property - def model(self) -> Module: - """Returns the model. - - This can also be a wrapped LightningModule. For retrieving the pure LightningModule use - :attr:`Accelerator.lightning_module` - """ - return self.training_type_plugin.model - - @model.setter - def model(self, new_model: Module) -> None: - self.training_type_plugin.model = new_model - - @property - def lightning_module(self) -> "pl.LightningModule": - """Returns the pure LightningModule. - - To get the potentially wrapped model use :attr:`Accelerator.model` - """ - return self.training_type_plugin.lightning_module - - @property - def root_device(self) -> torch.device: - """Returns the root device.""" - return self.training_type_plugin.root_device - - def teardown(self) -> None: - """This method is called to teardown the training process. - - It is the right place to release memory and free other resources. - """ - self.training_type_plugin.teardown() def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for a given device. diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 7d5786102d0b3..7831e188f4139 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -29,9 +29,9 @@ def setup(self, trainer: "pl.Trainer") -> None: MisconfigurationException: If the selected device is not CPU. """ - if "cpu" not in str(self.training_type_plugin.root_device): + if "cpu" not in str(self.root_device): raise MisconfigurationException( - f"Device should be CPU, got {self.training_type_plugin.root_device} instead." + f"Device should be CPU, got {self.root_device} instead." ) return super().setup(trainer) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index c6c82d83c32f5..b472fe544f28d 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -37,11 +37,11 @@ def setup_environment(self) -> None: If the selected device is not GPU. 
""" super().setup_environment() - if "cuda" not in str(self.training_type_plugin.root_device): + if "cuda" not in str(self.root_device): raise MisconfigurationException( - f"Device should be GPU, got {self.training_type_plugin.root_device} instead" + f"Device should be GPU, got {self.root_device} instead" ) - torch.cuda.set_device(self.training_type_plugin.root_device) + torch.cuda.set_device(self.root_device) def setup(self, trainer: "pl.Trainer") -> None: self.set_nvidia_flags(trainer.local_rank) @@ -74,10 +74,6 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: return torch.cuda.memory_stats(device) return get_nvidia_gpu_stats(device) - def teardown(self) -> None: - super().teardown() - self.training_type_plugin._move_optimizer_state(torch.device("cpu")) - @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 35fe3d053d0d4..4df019d3838cd 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -398,7 +398,7 @@ def seed_everything(seed: Optional[int] = None, workers: Optional[bool] = None) return seed_everything(seed=seed, workers=workers) def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: - self._accelerator.setup_environment() + self._strategy.setup_environment() # apply sharded context to prevent OOM run_method = partial(self._run_with_sharded_context, run_method) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 829735b0e0bed..a5091a70f09c9 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -147,6 +147,7 @@ def setup_environment(self) -> None: self._call_children_scripts() self.setup_distributed() + super().setup_environment() def _setup_model(self, model: Module) -> DistributedDataParallel: """Wraps the model into a :class:`~torch.nn.parallel.distributed.DistributedDataParallel` module.""" diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 0c7e1f8410e57..46b2f92c45c81 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -23,6 +23,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -42,8 +43,9 @@ class TrainingTypePlugin(ABC): loop.""" def __init__( - self, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None + self, accelerator: Accelerator, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None ) -> None: + self._accelerator = accelerator self._model: Optional[Module] = None checkpoint_io = checkpoint_io if checkpoint_io is not None else TorchCheckpointIO() self._checkpoint_io = checkpoint_io @@ -57,6 +59,10 @@ def __init__( f" Move your implementation to `{self.__class__.__name__}.teardown()` instead." 
) + @property + def accelerator(self) -> Accelerator: + return self._accelerator + @property def checkpoint_io(self) -> CheckpointIO: return self._checkpoint_io @@ -79,6 +85,7 @@ def setup_environment(self) -> None: This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator environment before setup is complete. """ + self.accelerator.setup_environment() def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Creates optimizers and schedulers. @@ -101,6 +108,7 @@ def setup(self, trainer: "pl.Trainer") -> None: Args: trainer: the trainer instance """ + self.accelerator.setup(trainer) self.setup_optimizers(trainer) self.setup_precision_plugin() @@ -425,6 +433,7 @@ def teardown(self) -> None: It is the right place to release memory and free other resources. """ + self._move_optimizer_state(torch.device("cpu")) @classmethod def register_plugins(cls, plugin_registry) -> None: @@ -437,7 +446,7 @@ def should_rank_save_checkpoint(self) -> bool: def on_train_start(self) -> None: """Called when train begins.""" - pass + self.accelerator.on_train_start() def on_validation_start(self) -> None: """Called when validation begins.""" diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 87154efbd478a..12e84fbf9e375 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -188,7 +188,7 @@ def test_optimization(tmpdir): def test_mixed_precision(tmpdir): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: - assert trainer.accelerator.model.precision == 16 + assert trainer.training_type_plugin.model.precision == 16 raise SystemExit model = IPUModel() @@ -203,8 +203,8 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st def test_pure_half_precision(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert trainer.accelerator.model.precision == 16 - for param in trainer.accelerator.model.parameters(): + assert trainer.training_type_plugin.model.precision == 16 + for param in trainer.training_type_plugin.model.parameters(): assert param.dtype == torch.float16 raise SystemExit diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index fc1ce413cd494..07a1361a57380 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -290,23 +290,23 @@ def forward(self, x): def test_tpu_invalid_raises(): accelerator = TPUAccelerator(object(), TPUSpawnPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): - accelerator.setup(object()) + training_type_plugin.setup(object()) accelerator = TPUAccelerator(TPUPrecisionPlugin(), DDPPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugi"): - accelerator.setup(object()) + training_type_plugin.setup(object()) def test_tpu_invalid_raises_set_precision_with_strategy(): accelerator = TPUAccelerator(object(), TPUSpawnPlugin(precision_plugin=object())) with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): - accelerator.setup(object()) + training_type_plugin.setup(object()) accelerator = TPUAccelerator(None, DDPPlugin(precision_plugin=TPUPrecisionPlugin())) with pytest.raises( ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin" ): - 
accelerator.setup(object()) + training_type_plugin.setup(object()) @RunIf(tpu=True) diff --git a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py index 6967ea9a12bd7..2a19e646e123f 100644 --- a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py @@ -24,7 +24,7 @@ def test_invalid_on_cpu(tmpdir): ): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") assert isinstance(trainer.accelerator.training_type_plugin, DDPFullyShardedPlugin) - trainer.accelerator.setup_environment() + trainer.training_type_plugin.setup_environment() @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 1aaf89d052686..e99474efd8a7e 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -56,11 +56,11 @@ def test_ddp_with_2_gpus(): class BarrierModel(BoringModel): def setup(self, stage=None): - assert not isinstance(self.trainer.accelerator.model, DistributedDataParallel) + assert not isinstance(self.trainer.training_type_plugin.model, DistributedDataParallel) self.trainer.training_type_plugin.barrier("barrier before model is wrapped") def on_train_start(self): - assert isinstance(self.trainer.accelerator.model, DistributedDataParallel) + assert isinstance(self.trainer.training_type_plugin.model, DistributedDataParallel) self.trainer.training_type_plugin.barrier("barrier after model is wrapped") @@ -110,8 +110,8 @@ def test_ddp_configure_ddp(): # test wrap the model if fitting trainer.state.fn = TrainerFn.FITTING trainer.training_type_plugin.connect(model) - trainer.accelerator.setup_environment() - trainer.accelerator.setup(trainer) + trainer.training_type_plugin.setup_environment() + trainer.training_type_plugin.setup(trainer) trainer.lightning_module.trainer = trainer assert isinstance(trainer.model, LightningModule) trainer._pre_dispatch() @@ -124,8 +124,8 @@ def test_ddp_configure_ddp(): ) # test do not wrap the model if trainerFN is not fitting trainer.training_type_plugin.connect(model) - trainer.accelerator.setup_environment() - trainer.accelerator.setup(trainer) + trainer.training_type_plugin.setup_environment() + trainer.training_type_plugin.setup(trainer) trainer.lightning_module.trainer = trainer trainer._pre_dispatch() # in DDPPlugin configure_ddp(), model are still LightningModule From 8ae530ccb4dc32e79cc1a9fae3ff10b9f90b4597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 7 Dec 2021 13:23:11 +0100 Subject: [PATCH 04/42] wip --- .../plugins/training_type/ddp.py | 2 ++ .../plugins/training_type/parallel.py | 4 +++- .../plugins/training_type/single_device.py | 4 +++- .../plugins/training_type/single_tpu.py | 4 +++- .../training_type/training_type_plugin.py | 8 ++++++- .../connectors/accelerator_connector.py | 23 +++++++++---------- 6 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index a5091a70f09c9..0c2b6a304508e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -31,6 +31,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.core.optimizer import LightningOptimizer from 
pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward @@ -84,6 +85,7 @@ class DDPPlugin(ParallelPlugin): def __init__( self, + accelerator: Optional[Accelerator] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 293e52170d4b8..c1e1c7a7907c2 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -20,6 +20,7 @@ from torch.nn.parallel import DistributedDataParallel import pytorch_lightning as pl +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -34,12 +35,13 @@ class ParallelPlugin(TrainingTypePlugin, ABC): def __init__( self, + accelerator: Optional[Accelerator] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): - super().__init__(checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) self.parallel_devices = parallel_devices self.cluster_environment = cluster_environment diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 0159e86412cbf..c26d7ff32c354 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -16,6 +16,7 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin @@ -28,10 +29,11 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__( self, device: torch.device, + accelerator: Optional[Accelerator] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): - super().__init__(checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) self.device: torch.device = device self.global_rank = 0 self.local_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index 011604468e1f5..ffe7f20fb7174 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -15,6 +15,7 @@ from typing import Any, Dict, Optional import pytorch_lightning as pl +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -34,6 +35,7 @@ class 
SingleTPUPlugin(SingleDevicePlugin): def __init__( self, device: int, + accelerator: Optional[Accelerator] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, debug: bool = False, @@ -41,7 +43,7 @@ def __init__( device = xm.xla_device(device) checkpoint_io = checkpoint_io or XLACheckpointIO() - super().__init__(device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + super().__init__(accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) self.debug = debug self.tpu_local_core_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 46b2f92c45c81..13e45127d7c57 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -43,7 +43,7 @@ class TrainingTypePlugin(ABC): loop.""" def __init__( - self, accelerator: Accelerator, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None + self, accelerator: Optional[Accelerator] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None ) -> None: self._accelerator = accelerator self._model: Optional[Module] = None @@ -63,6 +63,12 @@ def __init__( def accelerator(self) -> Accelerator: return self._accelerator + @accelerator.setter + def accelerator(self, accelerator: Accelerator) -> None: + if self._accelerator is not None: + raise ValueError("Accelerator already set.") + self._accelerator = accelerator + @property def checkpoint_io(self) -> CheckpointIO: return self._checkpoint_io diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 18a4da416946d..628f62272b17b 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -163,6 +163,10 @@ def __init__( else: self.set_distributed_mode() + self._validate_accelerator_type() + self._set_devices_if_none() + self.accelerator = self.select_accelerator() + self.handle_given_plugins() self._set_distrib_type_if_training_type_plugin_passed() @@ -170,12 +174,7 @@ def __init__( self.update_device_type_if_ipu_plugin() self.update_device_type_if_training_type_plugin_passed() - - self._validate_accelerator_type() - self._set_devices_if_none() - self._training_type_plugin_resolved = False - self.accelerator = self.select_accelerator() # benchmarking # TODO: should this be moved to GPU accelerator? 
@@ -698,10 +697,10 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: ): plugin = self.distributed_backend.training_type_plugin elif self.use_ddp2: - plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment) + plugin = DDP2Plugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment,) elif self.use_ddp and self.use_deepspeed: plugin = DeepSpeedPlugin( - cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices + accelerator=self.accelerator, cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices, ) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks() @@ -740,19 +739,19 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: ddp_plugin_cls = DDPPlugin plugin = ddp_plugin_cls( - parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment + accelerator=self.accelerator, parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment, ) elif self.use_dp: - plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) + plugin = DataParallelPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) elif self.use_horovod: - plugin = HorovodPlugin(parallel_devices=self.parallel_devices) + plugin = HorovodPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) elif self.use_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) elif self.use_ipu: - plugin = IPUPlugin(parallel_devices=self.parallel_devices) + plugin = IPUPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) - plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")) + plugin = SingleDevicePlugin(accelerator=self.accelerator, device=(torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")),) return plugin def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> TrainingTypePlugin: From da00425b0e053272bcfef44eec9bcec264a4b63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 8 Dec 2021 16:10:51 +0100 Subject: [PATCH 05/42] accelerator references --- pytorch_lightning/plugins/training_type/ddp.py | 4 ++-- pytorch_lightning/plugins/training_type/ddp_spawn.py | 2 ++ pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++ pytorch_lightning/plugins/training_type/dp.py | 2 ++ pytorch_lightning/plugins/training_type/fully_sharded.py | 2 ++ pytorch_lightning/plugins/training_type/horovod.py | 2 ++ pytorch_lightning/plugins/training_type/ipu.py | 2 ++ pytorch_lightning/plugins/training_type/tpu_spawn.py | 4 +++- 8 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 0c2b6a304508e..bc1db13d1490f 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -31,7 +31,6 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import 
prepare_for_backward @@ -85,7 +84,7 @@ class DDPPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional[Accelerator] = None, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, @@ -97,6 +96,7 @@ def __init__( **kwargs: Union[Any, Dict[str, Any]], ) -> None: super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 975f4ba435b2d..d433d5a93c0cc 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -62,6 +62,7 @@ class DDPSpawnPlugin(ParallelPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, @@ -72,6 +73,7 @@ def __init__( **kwargs: Any, ): super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index f30d15d495f9f..6aee74f38b004 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -88,6 +88,7 @@ class DeepSpeedPlugin(DDPPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, zero_optimization: bool = True, stage: int = 2, remote_device: str = "cpu", @@ -273,6 +274,7 @@ def __init__( ) super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, precision_plugin=precision_plugin, diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 69ba2fed867a7..71ee0fc7b9d86 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -35,11 +35,13 @@ class DataParallelPlugin(ParallelPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=None, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/fully_sharded.py b/pytorch_lightning/plugins/training_type/fully_sharded.py index d1b1257622beb..2824fa3e03d33 100644 --- a/pytorch_lightning/plugins/training_type/fully_sharded.py +++ b/pytorch_lightning/plugins/training_type/fully_sharded.py @@ -37,6 +37,7 @@ class DDPFullyShardedPlugin(DDPPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, cpu_offload: bool = False, flatten_parameters: bool = True, reshard_after_forward: bool = True, @@ -98,6 +99,7 @@ def __init__( """ super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 184183f5775e3..67ef3b492feb4 
100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -41,11 +41,13 @@ class HorovodPlugin(ParallelPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=None, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 2763ad645facb..a9033045c22c0 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -62,6 +62,7 @@ class IPUPlugin(ParallelPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, device_iterations: int = 1, autoreport: bool = False, autoreport_dir: Optional[str] = None, @@ -86,6 +87,7 @@ def __init__( created options for validation/testing and predicting. """ super().__init__( + accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 013b73459746f..42a8b838e94d2 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -54,6 +54,7 @@ class TPUSpawnPlugin(DDPSpawnPlugin): def __init__( self, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[int]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, @@ -62,7 +63,8 @@ def __init__( ) -> None: checkpoint_io = checkpoint_io or XLACheckpointIO() super().__init__( - parallel_devices=parallel_devices, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin + accelerator=accelerator, + parallel_devices=parallel_devices, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, ) self.debug = debug self.tpu_local_core_rank = 0 From 7db6742baba5d5bc9172861aa9ad61a7181d32c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 8 Dec 2021 16:33:22 +0100 Subject: [PATCH 06/42] cpu training --- pytorch_lightning/accelerators/cpu.py | 4 ++ .../plugins/training_type/parallel.py | 3 +- .../plugins/training_type/single_device.py | 3 +- .../plugins/training_type/single_tpu.py | 3 +- .../training_type/training_type_plugin.py | 7 ++-- .../connectors/accelerator_connector.py | 39 +++++++------------ pytorch_lightning/trainer/trainer.py | 6 +-- 7 files changed, 28 insertions(+), 37 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 7831e188f4139..da08c296bef16 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -23,6 +23,10 @@ class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" + @property + def root_device(self): + return torch.device("cpu") + def setup(self, trainer: "pl.Trainer") -> None: """ Raises: diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index c1e1c7a7907c2..8eff90289e199 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -20,7 +20,6 @@ from torch.nn.parallel import 
DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -35,7 +34,7 @@ class ParallelPlugin(TrainingTypePlugin, ABC): def __init__( self, - accelerator: Optional[Accelerator] = None, + accelerator: Optional["pl.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index c26d7ff32c354..288a47b2dde77 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -16,7 +16,6 @@ import torch import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin @@ -29,7 +28,7 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__( self, device: torch.device, - accelerator: Optional[Accelerator] = None, + accelerator: Optional["pl.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index ffe7f20fb7174..fde5d36d72e95 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -15,7 +15,6 @@ from typing import Any, Dict, Optional import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -35,7 +34,7 @@ class SingleTPUPlugin(SingleDevicePlugin): def __init__( self, device: int, - accelerator: Optional[Accelerator] = None, + accelerator: Optional["pl.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, debug: bool = False, diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 13e45127d7c57..6a22c45500c1d 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -23,7 +23,6 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -43,7 +42,7 @@ class TrainingTypePlugin(ABC): loop.""" def __init__( - self, accelerator: Optional[Accelerator] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None + self, accelerator: Optional["pl.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = 
None, precision_plugin: Optional[PrecisionPlugin] = None ) -> None: self._accelerator = accelerator self._model: Optional[Module] = None @@ -60,11 +59,11 @@ def __init__( ) @property - def accelerator(self) -> Accelerator: + def accelerator(self) -> "pl.Accelerator": return self._accelerator @accelerator.setter - def accelerator(self, accelerator: Accelerator) -> None: + def accelerator(self, accelerator: "pl.Accelerator") -> None: if self._accelerator is not None: raise ValueError("Accelerator already set.") self._accelerator = accelerator diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 628f62272b17b..c98ca2fa9f1e5 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -156,16 +156,13 @@ def __init__( self._warn_if_devices_flag_ignored() - self.select_accelerator_type() - - if self.strategy is not None: - self._set_training_type_plugin() - else: - self.set_distributed_mode() + self.set_distributed_mode() + self.accelerator = self.select_accelerator() + self.select_training_type_plugin() + self.select_accelerator_type() self._validate_accelerator_type() self._set_devices_if_none() - self.accelerator = self.select_accelerator() self.handle_given_plugins() self._set_distrib_type_if_training_type_plugin_passed() @@ -395,11 +392,15 @@ def precision_plugin(self) -> PrecisionPlugin: @property def training_type_plugin(self) -> TrainingTypePlugin: - if self._training_type_plugin_resolved: - # avoid calling `resolve_training_type_plugin` multiple times - return self._training_type_plugin if self._training_type_plugin is None: - self._training_type_plugin = self.select_training_type_plugin() + raise TypeError("Tried to access TTP before initialization finished") + return self._training_type_plugin + + def select_training_type_plugin(self): + self._set_training_type_plugin() + + if self._training_type_plugin is None: + self._training_type_plugin = self.create_training_type_plugin() self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin) # attach checkpoint plugin to the training type plugin if self._checkpoint_io is not None: @@ -407,8 +408,6 @@ def training_type_plugin(self) -> TrainingTypePlugin: precision_plugin = self.precision_plugin if precision_plugin is not None: self._training_type_plugin._precision_plugin = precision_plugin - self._training_type_plugin_resolved = True - return self._training_type_plugin @property @@ -690,13 +689,8 @@ def select_precision_plugin(self) -> PrecisionPlugin: raise RuntimeError("No precision set") - def select_training_type_plugin(self) -> TrainingTypePlugin: - if ( - isinstance(self.distributed_backend, Accelerator) - and self.distributed_backend.training_type_plugin is not None - ): - plugin = self.distributed_backend.training_type_plugin - elif self.use_ddp2: + def create_training_type_plugin(self) -> TrainingTypePlugin: + if self.use_ddp2: plugin = DDP2Plugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment,) elif self.use_ddp and self.use_deepspeed: plugin = DeepSpeedPlugin( @@ -796,10 +790,7 @@ def select_accelerator(self) -> Accelerator: else: acc_cls = CPUAccelerator - accelerator = acc_cls(precision_plugin=None, training_type_plugin=self.training_type_plugin) - # transfer ownership of the plugins to the accelerator - self._training_type_plugin = 
proxy(self.training_type_plugin) - + accelerator = acc_cls() return accelerator def select_cluster_environment(self) -> ClusterEnvironment: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0455de7c278bd..29cd6f0199bcb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1670,11 +1670,11 @@ def _on_exception(self) -> None: @property def accelerator(self) -> Accelerator: - return self._accelerator_connector.accelerator + return self.training_type_plugin.accelerator @property def training_type_plugin(self) -> TrainingTypePlugin: - return self.accelerator.training_type_plugin + return self._accelerator_connector.training_type_plugin @property def precision_plugin(self) -> PrecisionPlugin: @@ -1748,7 +1748,7 @@ def data_parallel_device_ids(self) -> Optional[List[int]]: @property def lightning_module(self) -> "pl.LightningModule": - return self.accelerator.lightning_module + return self.training_type_plugin.lightning_module @property def optimizers(self) -> List[Optimizer]: From 8c7fc955e84e0932e258e762e1a21ece37ff4f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 8 Dec 2021 11:38:46 -0500 Subject: [PATCH 07/42] rename occurrences in tests --- pl_examples/bug_report/bug_report_model.py | 3 +++ .../test_accelerator_connector.py | 4 +-- tests/accelerators/test_cpu.py | 23 +++++++++------- tests/accelerators/test_ipu.py | 26 +++++++++---------- ..._ddp_fully_sharded_with_full_state_dict.py | 2 +- .../plugins/test_ddp_plugin_with_comm_hook.py | 8 +++--- tests/plugins/test_deepspeed_plugin.py | 16 ++++++------ tests/plugins/test_sharded_plugin.py | 4 +-- 8 files changed, 47 insertions(+), 39 deletions(-) diff --git a/pl_examples/bug_report/bug_report_model.py b/pl_examples/bug_report/bug_report_model.py index 7739630237d32..44a1f136c13a0 100644 --- a/pl_examples/bug_report/bug_report_model.py +++ b/pl_examples/bug_report/bug_report_model.py @@ -57,6 +57,9 @@ def run(): num_sanity_val_steps=0, max_epochs=1, enable_model_summary=False, + accelerator="cpu", + strategy="ddp", + devices=2, ) trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data) trainer.test(model, dataloaders=test_data) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 51316c155368c..830cd9a51ffb1 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -483,11 +483,11 @@ def test_plugin_accelerator_choice(accelerator: Optional[str], plugin: str): else: with pytest.deprecated_call(match=r"accelerator=.*\)` has been deprecated"): trainer = Trainer(accelerator=accelerator, plugins=plugin, num_processes=2) - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) + assert isinstance(trainer.training_type_plugin, DDPShardedPlugin) with pytest.deprecated_call(match="Passing .* `strategy` to the `plugins`"): trainer = Trainer(plugins=plugin, num_processes=2) - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) + assert isinstance(trainer.training_type_plugin, DDPShardedPlugin) @pytest.mark.parametrize( diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index 553d842ed186e..566952185252c 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -16,9 +16,12 @@ def test_restore_checkpoint_after_pre_dispatch_default(): """Assert default for restore_checkpoint_after_pre_dispatch is 
False.""" - plugin = SingleDevicePlugin(torch.device("cpu")) - accelerator = CPUAccelerator(training_type_plugin=plugin, precision_plugin=PrecisionPlugin()) - assert not accelerator.training_type_plugin.restore_checkpoint_after_pre_dispatch + plugin = SingleDevicePlugin( + accelerator=CPUAccelerator(), + device=torch.device("cpu"), + precision_plugin=PrecisionPlugin() + ) + assert not plugin.restore_checkpoint_after_pre_dispatch assert not plugin.restore_checkpoint_after_pre_dispatch @@ -49,14 +52,16 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: checkpoint_path = os.path.join(tmpdir, "model.pt") trainer.save_checkpoint(checkpoint_path) - plugin = TestPlugin(torch.device("cpu"), checkpoint_io=TorchCheckpointIO()) - accelerator = CPUAccelerator(training_type_plugin=plugin, precision_plugin=PrecisionPlugin()) - - assert accelerator.training_type_plugin.restore_checkpoint_after_pre_dispatch == restore_after_pre_dispatch + plugin = TestPlugin( + accelerator=CPUAccelerator(), + precision_plugin=PrecisionPlugin(), + device=torch.device("cpu"), + checkpoint_io=TorchCheckpointIO() + ) assert plugin.restore_checkpoint_after_pre_dispatch == restore_after_pre_dispatch - trainer = Trainer(default_root_dir=tmpdir, accelerator=accelerator, fast_dev_run=True) + trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, fast_dev_run=True) trainer.fit(model, ckpt_path=checkpoint_path) for func in (trainer.test, trainer.validate, trainer.predict): - accelerator.training_type_plugin.predispatched_called = False + plugin.predispatched_called = False func(model, ckpt_path=checkpoint_path) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 12e84fbf9e375..d404db964a87d 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -212,7 +212,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = model.half() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) - assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert isinstance(trainer.training_type_plugin, IPUPlugin) assert isinstance(trainer.training_type_plugin.precision_plugin, IPUPrecisionPlugin) assert trainer.training_type_plugin.precision_plugin.precision == 16 @@ -224,9 +224,9 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: def test_device_iterations_ipu_plugin(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert trainer.accelerator.training_type_plugin.device_iterations == 2 + assert trainer.training_type_plugin.device_iterations == 2 # assert device iterations has been set correctly within the poptorch options - poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + poptorch_model = trainer.training_type_plugin.poptorch_models[RunningStage.TRAINING] assert poptorch_model._options.toDict()["device_iterations"] == 2 raise SystemExit @@ -238,7 +238,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: strategy=IPUPlugin(device_iterations=2), callbacks=TestCallback(), ) - assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert isinstance(trainer.training_type_plugin, IPUPlugin) with pytest.raises(SystemExit): trainer.fit(model) @@ -251,7 +251,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: # since ipu handle 
accumulation assert trainer.accumulation_scheduler.scheduling == {0: 1} # assert poptorch option have been set correctly - poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + poptorch_model = trainer.training_type_plugin.poptorch_models[RunningStage.TRAINING] assert poptorch_model._options.Training.toDict()["gradient_accumulation"] == 2 raise SystemExit @@ -356,9 +356,9 @@ def test_manual_poptorch_opts(tmpdir): ) trainer.fit(model) - assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) - assert trainer.accelerator.training_type_plugin.training_opts == training_opts - assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts + assert isinstance(trainer.training_type_plugin, IPUPlugin) + assert trainer.training_type_plugin.training_opts == training_opts + assert trainer.training_type_plugin.inference_opts == inference_opts @RunIf(ipu=True) @@ -380,7 +380,7 @@ def test_manual_poptorch_opts_custom(tmpdir): class TestCallback(Callback): def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None: # ensure dataloaders were correctly set up during training. - plugin = trainer.accelerator.training_type_plugin + plugin = trainer.training_type_plugin assert isinstance(plugin, IPUPlugin) assert plugin.training_opts.replication_factor == 2 assert plugin.inference_opts.replication_factor == 1 @@ -400,7 +400,7 @@ def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None: trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin, callbacks=TestCallback()) trainer.fit(model) - plugin = trainer.accelerator.training_type_plugin + plugin = trainer.training_type_plugin assert isinstance(plugin, IPUPlugin) training_opts = plugin.training_opts @@ -462,9 +462,9 @@ def test_default_opts(tmpdir): trainer = Trainer(default_root_dir=tmpdir, ipus=1, fast_dev_run=True) trainer.fit(model) - assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) - inference_opts = trainer.accelerator.training_type_plugin.inference_opts - training_opts = trainer.accelerator.training_type_plugin.training_opts + assert isinstance(trainer.training_type_plugin, IPUPlugin) + inference_opts = trainer.training_type_plugin.inference_opts + training_opts = trainer.training_type_plugin.training_opts for opts in (inference_opts, training_opts): assert isinstance(opts, poptorch.Options) assert opts.Training.gradient_accumulation == 1 diff --git a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py index 2a19e646e123f..c4a2eeaf74c0b 100644 --- a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py @@ -23,7 +23,7 @@ def test_invalid_on_cpu(tmpdir): MisconfigurationException, match="You selected accelerator to be `ddp_fully_sharded`, but GPU is not available." 
): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.accelerator.training_type_plugin, DDPFullyShardedPlugin) + assert isinstance(trainer.training_type_plugin, DDPFullyShardedPlugin) trainer.training_type_plugin.setup_environment() diff --git a/tests/plugins/test_ddp_plugin_with_comm_hook.py b/tests/plugins/test_ddp_plugin_with_comm_hook.py index b67988b3efecf..69d320b52d426 100644 --- a/tests/plugins/test_ddp_plugin_with_comm_hook.py +++ b/tests/plugins/test_ddp_plugin_with_comm_hook.py @@ -40,7 +40,7 @@ def test_ddp_fp16_compress_comm_hook(tmpdir): fast_dev_run=True, ) trainer.fit(model) - trainer_comm_hook = trainer.accelerator.training_type_plugin.model.get_ddp_logging_data().comm_hook + trainer_comm_hook = trainer.training_type_plugin.model.get_ddp_logging_data().comm_hook expected_comm_hook = default.fp16_compress_hook.__qualname__ assert trainer_comm_hook == expected_comm_hook assert trainer.state.finished, f"Training failed with {trainer.state}" @@ -63,7 +63,7 @@ def test_ddp_sgd_comm_hook(tmpdir): fast_dev_run=True, ) trainer.fit(model) - trainer_comm_hook = trainer.accelerator.training_type_plugin.model.get_ddp_logging_data().comm_hook + trainer_comm_hook = trainer.training_type_plugin.model.get_ddp_logging_data().comm_hook expected_comm_hook = powerSGD.powerSGD_hook.__qualname__ assert trainer_comm_hook == expected_comm_hook assert trainer.state.finished, f"Training failed with {trainer.state}" @@ -87,7 +87,7 @@ def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): fast_dev_run=True, ) trainer.fit(model) - trainer_comm_hook = trainer.accelerator.training_type_plugin.model.get_ddp_logging_data().comm_hook + trainer_comm_hook = trainer.training_type_plugin.model.get_ddp_logging_data().comm_hook expected_comm_hook = default.fp16_compress_wrapper(powerSGD.powerSGD_hook).__qualname__ assert trainer_comm_hook == expected_comm_hook assert trainer.state.finished, f"Training failed with {trainer.state}" @@ -132,7 +132,7 @@ def test_ddp_post_local_sgd_comm_hook(tmpdir): sync_batchnorm=True, ) trainer.fit(model) - trainer_comm_hook = trainer.accelerator.training_type_plugin.model.get_ddp_logging_data().comm_hook + trainer_comm_hook = trainer.training_type_plugin.model.get_ddp_logging_data().comm_hook expected_comm_hook = post_localSGD.post_localSGD_hook.__qualname__ assert trainer_comm_hook == expected_comm_hook assert trainer.state.finished, f"Training failed with {trainer.state}" diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 4b56e9d389174..73ff8795a086d 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -133,8 +133,8 @@ def test_deepspeed_plugin_string(tmpdir, plugin): fast_dev_run=True, default_root_dir=tmpdir, strategy=plugin if isinstance(plugin, str) else plugin() ) - assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin) - assert trainer.accelerator.training_type_plugin.parallel_devices == [torch.device("cpu")] + assert isinstance(trainer.training_type_plugin, DeepSpeedPlugin) + assert trainer.training_type_plugin.parallel_devices == [torch.device("cpu")] @RunIf(deepspeed=True) @@ -147,7 +147,7 @@ def test_deepspeed_plugin_env(tmpdir, monkeypatch, deepspeed_config): trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed") - plugin = trainer.accelerator.training_type_plugin + plugin = trainer.training_type_plugin assert isinstance(plugin, DeepSpeedPlugin) assert 
plugin.parallel_devices == [torch.device("cpu")] assert plugin.config == deepspeed_config @@ -169,7 +169,7 @@ def test_deepspeed_precision_choice(amp_backend, precision, tmpdir): fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed", amp_backend=amp_backend, precision=precision ) - assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin) + assert isinstance(trainer.training_type_plugin, DeepSpeedPlugin) assert isinstance(trainer.training_type_plugin.precision_plugin, DeepSpeedPrecisionPlugin) assert trainer.training_type_plugin.precision_plugin.precision == precision @@ -235,8 +235,8 @@ def train_dataloader(self): class AssertCallback(Callback): def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None: - assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin) - config = trainer.accelerator.training_type_plugin.config + assert isinstance(trainer.training_type_plugin, DeepSpeedPlugin) + config = trainer.training_type_plugin.config # int value overrides auto mode expected_value = value if isinstance(value, int) else 1 @@ -688,8 +688,8 @@ class TestCallback(Callback): def on_train_batch_start( self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int ) -> None: - original_deepspeed_plugin = initial_trainer.accelerator.training_type_plugin - current_deepspeed_plugin = trainer.accelerator.training_type_plugin + original_deepspeed_plugin = initial_trainer.training_type_plugin + current_deepspeed_plugin = trainer.training_type_plugin assert isinstance(original_deepspeed_plugin, DeepSpeedPlugin) assert isinstance(current_deepspeed_plugin, DeepSpeedPlugin) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index e3b7e4986d9fb..9d7a72507b273 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -37,7 +37,7 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v def test_sharded_ddp_choice(tmpdir, strategy, expected): """Test to ensure that plugin is correctly chosen.""" trainer = Trainer(fast_dev_run=True, strategy=strategy) - assert isinstance(trainer.accelerator.training_type_plugin, expected) + assert isinstance(trainer.training_type_plugin, expected) @RunIf(min_gpus=1, fairscale=True) @@ -47,7 +47,7 @@ def test_sharded_ddp_choice(tmpdir, strategy, expected): def test_ddp_choice_sharded_amp(tmpdir, strategy, expected): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" trainer = Trainer(fast_dev_run=True, gpus=1, precision=16, strategy=strategy) - assert isinstance(trainer.accelerator.training_type_plugin, expected) + assert isinstance(trainer.training_type_plugin, expected) @RunIf(skip_windows=True, fairscale=True) From 4afbf5c70cdfaa2de84ee00efb8c5151a5cf5eba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 8 Dec 2021 11:47:37 -0500 Subject: [PATCH 08/42] update tests --- tests/accelerators/test_gpu.py | 10 ++-------- tests/accelerators/test_tpu.py | 10 +++++----- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py index 85ce0cd9f0f18..fc01c932b2635 100644 --- a/tests/accelerators/test_gpu.py +++ b/tests/accelerators/test_gpu.py @@ -11,10 +11,7 @@ def test_get_torch_gpu_stats(tmpdir): """Test GPU get_device_stats with Pytorch >= 1.8.0.""" current_device = torch.device(f"cuda:{torch.cuda.current_device()}") - GPUAccel = GPUAccelerator( - 
training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin() - ) - gpu_stats = GPUAccel.get_device_stats(current_device) + gpu_stats = GPUAccelerator().get_device_stats(current_device) fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] for f in fields: @@ -26,10 +23,7 @@ def test_get_torch_gpu_stats(tmpdir): def test_get_nvidia_gpu_stats(tmpdir): """Test GPU get_device_stats with Pytorch < 1.8.0.""" current_device = torch.device(f"cuda:{torch.cuda.current_device()}") - GPUAccel = GPUAccelerator( - training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin() - ) - gpu_stats = GPUAccel.get_device_stats(current_device) + gpu_stats = GPUAccelerator().get_device_stats(current_device) fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] for f in fields: diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 07a1361a57380..5883e712c12fd 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -13,7 +13,7 @@ # limitations under the License import collections from copy import deepcopy -from unittest.mock import patch +from unittest.mock import patch, Mock import pytest import torch @@ -288,13 +288,13 @@ def forward(self, x): def test_tpu_invalid_raises(): - accelerator = TPUAccelerator(object(), TPUSpawnPlugin()) + training_type_plugin = TPUSpawnPlugin(accelerator=TPUAccelerator(), precision_plugin=Mock()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): - training_type_plugin.setup(object()) + training_type_plugin.setup(Mock()) - accelerator = TPUAccelerator(TPUPrecisionPlugin(), DDPPlugin()) + training_type_plugin = DDPPlugin(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin()) with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugi"): - training_type_plugin.setup(object()) + training_type_plugin.setup(Mock()) def test_tpu_invalid_raises_set_precision_with_strategy(): From 8fdce970ab3261b34d21d30f20fdbac90e070450 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 9 Dec 2021 16:20:13 -0800 Subject: [PATCH 09/42] pull from adrian's commit --- pytorch_lightning/accelerators/cpu.py | 4 +--- pytorch_lightning/accelerators/gpu.py | 4 +--- .../plugins/training_type/single_tpu.py | 4 +++- .../plugins/training_type/tpu_spawn.py | 4 +++- .../training_type/training_type_plugin.py | 5 ++++- .../connectors/accelerator_connector.py | 19 +++++++++++++++---- tests/accelerators/test_tpu.py | 2 +- 7 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index da08c296bef16..c7db059c9f31a 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -34,9 +34,7 @@ def setup(self, trainer: "pl.Trainer") -> None: If the selected device is not CPU. """ if "cpu" not in str(self.root_device): - raise MisconfigurationException( - f"Device should be CPU, got {self.root_device} instead." 
- ) + raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") return super().setup(trainer) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index b472fe544f28d..6f1337f40855c 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -38,9 +38,7 @@ def setup_environment(self) -> None: """ super().setup_environment() if "cuda" not in str(self.root_device): - raise MisconfigurationException( - f"Device should be GPU, got {self.root_device} instead" - ) + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") torch.cuda.set_device(self.root_device) def setup(self, trainer: "pl.Trainer") -> None: diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index fde5d36d72e95..aa6c508f2bc7b 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -42,7 +42,9 @@ def __init__( device = xm.xla_device(device) checkpoint_io = checkpoint_io or XLACheckpointIO() - super().__init__(accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + super().__init__( + accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin + ) self.debug = debug self.tpu_local_core_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 42a8b838e94d2..f4215e08663c9 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -64,7 +64,9 @@ def __init__( checkpoint_io = checkpoint_io or XLACheckpointIO() super().__init__( accelerator=accelerator, - parallel_devices=parallel_devices, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, + parallel_devices=parallel_devices, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, ) self.debug = debug self.tpu_local_core_rank = 0 diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 6a22c45500c1d..043065bf4fa13 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -42,7 +42,10 @@ class TrainingTypePlugin(ABC): loop.""" def __init__( - self, accelerator: Optional["pl.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None + self, + accelerator: Optional["pl.Accelerator"] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[PrecisionPlugin] = None, ) -> None: self._accelerator = accelerator self._model: Optional[Module] = None diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c98ca2fa9f1e5..5219b061cbbaf 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -691,10 +691,16 @@ def select_precision_plugin(self) -> PrecisionPlugin: def create_training_type_plugin(self) -> TrainingTypePlugin: if self.use_ddp2: - plugin = DDP2Plugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment,) + plugin = DDP2Plugin( + 
accelerator=self.accelerator, + parallel_devices=self.parallel_devices, + cluster_environment=self.cluster_environment, + ) elif self.use_ddp and self.use_deepspeed: plugin = DeepSpeedPlugin( - accelerator=self.accelerator, cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices, + accelerator=self.accelerator, + cluster_environment=self.select_cluster_environment(), + parallel_devices=self.parallel_devices, ) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks() @@ -733,7 +739,9 @@ def create_training_type_plugin(self) -> TrainingTypePlugin: ddp_plugin_cls = DDPPlugin plugin = ddp_plugin_cls( - accelerator=self.accelerator, parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment, + accelerator=self.accelerator, + parallel_devices=self.parallel_devices, + cluster_environment=self.cluster_environment, ) elif self.use_dp: plugin = DataParallelPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) @@ -745,7 +753,10 @@ def create_training_type_plugin(self) -> TrainingTypePlugin: plugin = IPUPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) - plugin = SingleDevicePlugin(accelerator=self.accelerator, device=(torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")),) + plugin = SingleDevicePlugin( + accelerator=self.accelerator, + device=(torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")), + ) return plugin def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> TrainingTypePlugin: diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 5883e712c12fd..ef20df39d6acc 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -13,7 +13,7 @@ # limitations under the License import collections from copy import deepcopy -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch import pytest import torch From 1c7bf4d4b3d6e04439aac086c775fca08ba38140 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 9 Dec 2021 16:54:32 -0800 Subject: [PATCH 10/42] fix changelog merge pro --- CHANGELOG.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 319159d871f72..8f95ebaebac6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -255,19 +255,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed method `setup_optimizers_in_pre_dispatch` from the `strategies` and achieve the same logic in `setup` and `pre_dispatch` methods ([#10906](https://github.com/PyTorchLightning/pytorch-lightning/pull/10906)) -<<<<<<< HEAD - Removed methods `pre_dispatch`, `dispatch` and `post_dispatch` from the `Accelerator` ([#10885](https://github.com/PyTorchLightning/pytorch-lightning/pull/10885)) -- Removed method `training_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890)) -======= -- Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890)) ->>>>>>> c593c9f0f (remove test, val, predict step) - - -### Fixed - - - Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890)) From 59920f7301baee71e321a89f11bec90ad3ee3c75 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 9 Dec 2021 18:52:50 -0800 Subject: [PATCH 11/42] fix accelerator_connector and other updates --- pytorch_lightning/accelerators/accelerator.py | 12 +-- pytorch_lightning/accelerators/cpu.py | 28 +++--- pytorch_lightning/accelerators/gpu.py | 22 ++--- pytorch_lightning/accelerators/tpu.py | 45 +++++----- .../training_type/training_type_plugin.py | 6 +- .../connectors/accelerator_connector.py | 85 +++++++++---------- pytorch_lightning/trainer/trainer.py | 13 +-- .../test_accelerator_connector.py | 36 ++++---- 8 files changed, 119 insertions(+), 128 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9557781d084f5..a7122f4331fa9 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from abc import abstractmethod -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Union import torch -from torch.nn import Module import pytorch_lightning as pl -from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.plugins.training_type import TrainingTypePlugin class Accelerator: @@ -33,13 +30,6 @@ class Accelerator: - IPU """ - def __init__(self) -> None: - super().__init__() - - @property - def root_device(self): - return None - def setup_environment(self) -> None: """Setup any processes or distributed connections. diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index c7db059c9f31a..0608f12381ea1 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -15,28 +15,28 @@ import torch -import pytorch_lightning as pl +# import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities.exceptions import MisconfigurationException +# from pytorch_lightning.utilities.exceptions import MisconfigurationException class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" - @property - def root_device(self): - return torch.device("cpu") + # @property + # def root_device(self): + # return torch.device("cpu") - def setup(self, trainer: "pl.Trainer") -> None: - """ - Raises: - MisconfigurationException: - If the selected device is not CPU. 
- """ - if "cpu" not in str(self.root_device): - raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") + # def setup(self, trainer: "pl.Trainer") -> None: + # """ + # Raises: + # MisconfigurationException: + # If the selected device is not CPU. + # """ + # if "cpu" not in str(self.root_device): + # raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") - return super().setup(trainer) + # return super().setup(trainer) def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """CPU device stats aren't supported yet.""" diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 6f1337f40855c..a9b2e783d9abc 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -21,7 +21,8 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities.exceptions import MisconfigurationException + +# from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 _log = logging.getLogger(__name__) @@ -30,16 +31,15 @@ class GPUAccelerator(Accelerator): """Accelerator for GPU devices.""" - def setup_environment(self) -> None: - """ - Raises: - MisconfigurationException: - If the selected device is not GPU. - """ - super().setup_environment() - if "cuda" not in str(self.root_device): - raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") - torch.cuda.set_device(self.root_device) + # def setup_environment(self) -> None: + # """ + # Raises: + # MisconfigurationException: + # If the selected device is not GPU. + # """ + # if "cuda" not in str(self.root_device): + # raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + # torch.cuda.set_device(self.root_device) def setup(self, trainer: "pl.Trainer") -> None: self.set_nvidia_flags(trainer.local_rank) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index f116ed7f0f493..e0f633c0015c0 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -15,11 +15,12 @@ import torch -import pytorch_lightning as pl +# import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.plugins.precision import TPUPrecisionPlugin -from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin -from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin + +# from pytorch_lightning.plugins.precision import TPUPrecisionPlugin +# from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin +# from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.utilities import _XLA_AVAILABLE if _XLA_AVAILABLE: @@ -29,24 +30,24 @@ class TPUAccelerator(Accelerator): """Accelerator for TPU devices.""" - def setup(self, trainer: "pl.Trainer") -> None: - """ - Raises: - ValueError: - If the precision or training type plugin are unsupported. - """ - if not isinstance(self.training_type_plugin.precision_plugin, TPUPrecisionPlugin): - # this configuration should have been avoided in the accelerator connector - raise ValueError( - f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," - f" found: {self.training_type_plugin.precision_plugin}." 
- ) - if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): - raise ValueError( - "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin," - f" found {self.training_type_plugin}." - ) - return super().setup(trainer) + # def setup(self, trainer: "pl.Trainer") -> None: + # """ + # Raises: + # ValueError: + # If the precision or training type plugin are unsupported. + # """ + # if not isinstance(self.training_type_plugin.precision_plugin, TPUPrecisionPlugin): + # # this configuration should have been avoided in the accelerator connector + # raise ValueError( + # f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," + # f" found: {self.training_type_plugin.precision_plugin}." + # ) + # if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): + # raise ValueError( + # "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin," + # f" found {self.training_type_plugin}." + # ) + # return super().setup(trainer) def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given TPU device. diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 043065bf4fa13..335a4edb4754e 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -67,8 +67,8 @@ def accelerator(self) -> "pl.Accelerator": @accelerator.setter def accelerator(self, accelerator: "pl.Accelerator") -> None: - if self._accelerator is not None: - raise ValueError("Accelerator already set.") + # if self._accelerator is not None: + # raise ValueError("Accelerator already set.") self._accelerator = accelerator @property @@ -454,7 +454,7 @@ def should_rank_save_checkpoint(self) -> bool: def on_train_start(self) -> None: """Called when train begins.""" - self.accelerator.on_train_start() + pass def on_validation_start(self) -> None: """Called when validation begins.""" diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 5219b061cbbaf..0e8e798e5eb86 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -156,13 +156,12 @@ def __init__( self._warn_if_devices_flag_ignored() - self.set_distributed_mode() - self.accelerator = self.select_accelerator() - self.select_training_type_plugin() - self.select_accelerator_type() - self._validate_accelerator_type() - self._set_devices_if_none() + + if self.strategy is not None: + self._set_training_type_plugin() + else: + self.set_distributed_mode() self.handle_given_plugins() self._set_distrib_type_if_training_type_plugin_passed() @@ -171,7 +170,13 @@ def __init__( self.update_device_type_if_ipu_plugin() self.update_device_type_if_training_type_plugin_passed() + + self._validate_accelerator_type() + self._set_devices_if_none() + self._training_type_plugin_resolved = False + self.training_type_plugin = self.final_training_type_plugin() + self.accelerator = self.training_type_plugin.accelerator # benchmarking # TODO: should this be moved to GPU accelerator? 
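
For orientation between these connector hunks: after this commit the resolved strategy owns both the accelerator and the precision plugin, and the trainer-facing objects are read back from it. A minimal sketch of that wiring, assuming the 1.5-era names used throughout this series (`DDPPlugin`, `GPUAccelerator`, `NativeMixedPrecisionPlugin`) and a machine with at least two CUDA devices; illustrative only, not taken from the diff itself:

```python
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import GPUAccelerator
from pytorch_lightning.plugins import DDPPlugin, NativeMixedPrecisionPlugin

# the strategy (TrainingTypePlugin) is constructed with its accelerator and precision plugin
strategy = DDPPlugin(
    accelerator=GPUAccelerator(),
    precision_plugin=NativeMixedPrecisionPlugin(precision=16, device="cuda"),
)

trainer = Trainer(strategy=strategy, gpus=2)

# the trainer now derives accelerator and precision from the resolved strategy,
# rather than the accelerator owning the training type plugin
assert trainer.training_type_plugin is strategy
assert isinstance(trainer.accelerator, GPUAccelerator)
```
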
@@ -390,17 +395,12 @@ def precision_plugin(self) -> PrecisionPlugin: self._precision_plugin = self.select_precision_plugin() return self._precision_plugin - @property - def training_type_plugin(self) -> TrainingTypePlugin: + def final_training_type_plugin(self) -> TrainingTypePlugin: + if self._training_type_plugin_resolved: + # avoid calling `resolve_training_type_plugin` multiple times + return self._training_type_plugin if self._training_type_plugin is None: - raise TypeError("Tried to access TTP before initialization finished") - return self._training_type_plugin - - def select_training_type_plugin(self): - self._set_training_type_plugin() - - if self._training_type_plugin is None: - self._training_type_plugin = self.create_training_type_plugin() + self._training_type_plugin = self.select_training_type_plugin() self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin) # attach checkpoint plugin to the training type plugin if self._checkpoint_io is not None: @@ -408,6 +408,9 @@ def select_training_type_plugin(self): precision_plugin = self.precision_plugin if precision_plugin is not None: self._training_type_plugin._precision_plugin = precision_plugin + self._training_type_plugin_resolved = True + + self._training_type_plugin.accelerator = self.select_accelerator() return self._training_type_plugin @property @@ -689,18 +692,17 @@ def select_precision_plugin(self) -> PrecisionPlugin: raise RuntimeError("No precision set") - def create_training_type_plugin(self) -> TrainingTypePlugin: - if self.use_ddp2: - plugin = DDP2Plugin( - accelerator=self.accelerator, - parallel_devices=self.parallel_devices, - cluster_environment=self.cluster_environment, - ) + def select_training_type_plugin(self) -> TrainingTypePlugin: + if ( + isinstance(self.distributed_backend, Accelerator) + and self.distributed_backend.training_type_plugin is not None + ): + plugin = self.distributed_backend.training_type_plugin + elif self.use_ddp2: + plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment) elif self.use_ddp and self.use_deepspeed: plugin = DeepSpeedPlugin( - accelerator=self.accelerator, - cluster_environment=self.select_cluster_environment(), - parallel_devices=self.parallel_devices, + cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices ) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks() @@ -739,24 +741,19 @@ def create_training_type_plugin(self) -> TrainingTypePlugin: ddp_plugin_cls = DDPPlugin plugin = ddp_plugin_cls( - accelerator=self.accelerator, - parallel_devices=self.parallel_devices, - cluster_environment=self.cluster_environment, + parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment ) elif self.use_dp: - plugin = DataParallelPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) + plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_horovod: - plugin = HorovodPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) + plugin = HorovodPlugin(parallel_devices=self.parallel_devices) elif self.use_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) elif self.use_ipu: - plugin = IPUPlugin(accelerator=self.accelerator, parallel_devices=self.parallel_devices) + plugin = IPUPlugin(parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) - 
plugin = SingleDevicePlugin( - accelerator=self.accelerator, - device=(torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")), - ) + plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.use_gpu else "cpu")) return plugin def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> TrainingTypePlugin: @@ -802,6 +799,9 @@ def select_accelerator(self) -> Accelerator: acc_cls = CPUAccelerator accelerator = acc_cls() + # transfer ownership of the plugins to the accelerator + # self._training_type_plugin = proxy(self.training_type_plugin) + return accelerator def select_cluster_environment(self) -> ClusterEnvironment: @@ -998,14 +998,13 @@ def _set_distrib_type_if_training_type_plugin_passed(self): self._distrib_type = getattr(self._training_type_plugin, "distributed_backend", None) def _is_slurm_managing_tasks(self) -> bool: - """Returns whether we let SLURM manage the processes or not. - - Returns ``True`` if and only if these conditions match: + """Returns whether we let SLURM manage the processes or not. Returns ``True`` if and only if these + conditions match: - - A SLURM cluster is detected - - A distributed plugin is being used - - The process is not launching in interactive mode - - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer + - A SLURM cluster is detected + - A distributed plugin is being used + - The process is not launching in interactive mode + - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer """ if ( (not self.use_ddp and not self.use_ddp2) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 29cd6f0199bcb..8883d1862b801 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -451,6 +451,7 @@ def __init__( amp_level, plugins, ) + self._accelerator_connector.training_type_plugin self.logger_connector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) self.checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) @@ -1106,7 +1107,7 @@ def _run( # SET UP TRAINING # ---------------------------- self._call_callback_hooks("on_before_accelerator_backend_setup") - self.accelerator.setup_environment() + self.training_type_plugin.setup_environment() self._call_setup_hook() # allow user to setup lightning_module in accelerator environment # check if we should delay restoring checkpoint till later @@ -1114,7 +1115,7 @@ def _run( self._restore_modules_and_callbacks(ckpt_path) self._call_configure_sharded_model() # allow user to setup in model sharded environment - self.accelerator.setup(self) + self.training_type_plugin.setup(self) # ---------------------------- # INSPECT THE CORE LOOPS @@ -1124,7 +1125,7 @@ def _run( {Trainer.fit} or {Trainer.test} or {Trainer.predict} || | || spawn processes || - {self.accelerator.setup_environment} || + {self.training_type_plugin.setup_environment} || | || setup accelerator || and strategy || LIGHTNING @@ -1231,7 +1232,7 @@ def _teardown(self): """This is the Trainer's internal teardown, unrelated to the `teardown` hooks in LightningModule and Callback; those are handled by :meth:`_call_teardown_hook`.""" self.training_type_plugin.post_dispatch(self) - self.accelerator.teardown() + self.training_type_plugin.teardown() self._data_connector.teardown() self._active_loop.teardown() self.logger_connector.teardown() @@ -1806,7 +1807,7 @@ def model(self) -> 
torch.nn.Module: To access the pure LightningModule, use :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. """ - return self.accelerator.model + return self.training_type_plugin.model @model.setter def model(self, model: torch.nn.Module) -> None: @@ -1817,7 +1818,7 @@ def model(self, model: torch.nn.Module) -> None: model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending on the backend. """ - self.accelerator.model = model + self.training_type_plugin.model = model """ General properties diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 830cd9a51ffb1..82e2864fac520 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -411,24 +411,24 @@ class Prec(PrecisionPlugin): class TrainTypePlugin(SingleDevicePlugin): pass - ttp = TrainTypePlugin(device=torch.device("cpu")) - accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) - trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) - assert isinstance(trainer.accelerator, Accel) - assert isinstance(trainer.training_type_plugin, TrainTypePlugin) - assert isinstance(trainer.precision_plugin, Prec) - assert trainer._accelerator_connector.training_type_plugin is ttp - - class DistributedPlugin(DDPPlugin): - pass - - ttp = DistributedPlugin() - accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) - trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) - assert isinstance(trainer.accelerator, Accel) - assert isinstance(trainer.training_type_plugin, DistributedPlugin) - assert isinstance(trainer.precision_plugin, Prec) - assert trainer._accelerator_connector.training_type_plugin is ttp +# ttp = TrainTypePlugin(device=torch.device("cpu")) +# accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) +# trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) +# assert isinstance(trainer.accelerator, Accel) +# assert isinstance(trainer.training_type_plugin, TrainTypePlugin) +# assert isinstance(trainer.precision_plugin, Prec) +# assert trainer._accelerator_connector.training_type_plugin is ttp + +# class DistributedPlugin(DDPPlugin): +# pass + +# ttp = DistributedPlugin() +# accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) +# trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) +# assert isinstance(trainer.accelerator, Accel) +# assert isinstance(trainer.training_type_plugin, DistributedPlugin) +# assert isinstance(trainer.precision_plugin, Prec) +# assert trainer._accelerator_connector.training_type_plugin is ttp @mock.patch.dict( From 7637a7c25043ecfd8b2a3a0dd7b55d9bf4c6f82b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Dec 2021 02:54:20 +0000 Subject: [PATCH 12/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/cpu.py | 1 + tests/accelerators/test_accelerator_connector.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 0608f12381ea1..739503efdd898 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -17,6 +17,7 @@ # import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator 
import Accelerator + # from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 82e2864fac520..19e5755e54341 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -411,6 +411,7 @@ class Prec(PrecisionPlugin): class TrainTypePlugin(SingleDevicePlugin): pass + # ttp = TrainTypePlugin(device=torch.device("cpu")) # accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) # trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) From d378475d707ce6a12251bcea612b085fdadc24aa Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 9 Dec 2021 19:19:11 -0800 Subject: [PATCH 13/42] fix doc build and some mypy --- docs/source/extensions/accelerators.rst | 5 +---- docs/source/extensions/plugins.rst | 5 +---- tests/accelerators/test_gpu.py | 2 -- tests/accelerators/test_tpu.py | 29 ++++++++++++++----------- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/docs/source/extensions/accelerators.rst b/docs/source/extensions/accelerators.rst index 4db625adea5dc..fdc3468b90761 100644 --- a/docs/source/extensions/accelerators.rst +++ b/docs/source/extensions/accelerators.rst @@ -25,10 +25,7 @@ One to handle differences from the training routine and one to handle different from pytorch_lightning.accelerators import GPUAccelerator from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin - accelerator = GPUAccelerator( - precision_plugin=NativeMixedPrecisionPlugin(precision=16, device="cuda"), - training_type_plugin=DDPPlugin(), - ) + accelerator = GPUAccelerator() trainer = Trainer(accelerator=accelerator) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 78c6503fea34d..0c9a6bbbbbc1c 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -80,10 +80,7 @@ can then be passed into the Trainer directly or via a (custom) accelerator: trainer = Trainer(strategy=CustomDDPPlugin(), plugins=[CustomPrecisionPlugin()]) # fully custom accelerator and plugins - accelerator = MyAccelerator( - precision_plugin=CustomPrecisionPlugin(), - training_type_plugin=CustomDDPPlugin(), - ) + accelerator = MyAccelerator() trainer = Trainer(accelerator=accelerator) diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py index fc01c932b2635..ece78da24972d 100644 --- a/tests/accelerators/test_gpu.py +++ b/tests/accelerators/test_gpu.py @@ -1,8 +1,6 @@ import torch from pytorch_lightning.accelerators import GPUAccelerator -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin from tests.helpers.runif import RunIf diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index ef20df39d6acc..bad3036764b1d 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -288,25 +288,28 @@ def forward(self, x): def test_tpu_invalid_raises(): + # TODO move TPUAccelerator() and CPUAccelerator() setup() misconfig logic into strategies training_type_plugin = TPUSpawnPlugin(accelerator=TPUAccelerator(), precision_plugin=Mock()) - with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): - training_type_plugin.setup(Mock()) + # with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a 
`TPUPrecisionPlugin"): + # training_type_plugin.setup(Mock()) training_type_plugin = DDPPlugin(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin()) - with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugi"): - training_type_plugin.setup(Mock()) + # with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin`): + # training_type_plugin.setup(Mock()) def test_tpu_invalid_raises_set_precision_with_strategy(): - accelerator = TPUAccelerator(object(), TPUSpawnPlugin(precision_plugin=object())) - with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): - training_type_plugin.setup(object()) - - accelerator = TPUAccelerator(None, DDPPlugin(precision_plugin=TPUPrecisionPlugin())) - with pytest.raises( - ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin" - ): - training_type_plugin.setup(object()) + accelerator = TPUAccelerator() + training_type_plugin = TPUSpawnPlugin(accelerator=accelerator, precision_plugin=object()) + # with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): + # training_type_plugin.setup(object()) + + accelerator = TPUAccelerator() + training_type_plugin = DDPPlugin(accelerator=accelerator, precision_plugin=TPUPrecisionPlugin()) + # with pytest.raises( + # ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin" + # ): + # training_type_plugin.setup(object()) @RunIf(tpu=True) From 2810731534945f1288ee6f8791733785e4b223dc Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Thu, 9 Dec 2021 19:24:55 -0800 Subject: [PATCH 14/42] fix lite --- pytorch_lightning/lite/lite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 4df019d3838cd..a07ed1cc3dfab 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -99,8 +99,8 @@ def __init__( amp_level=None, plugins=plugins, ) - self._accelerator = self._accelerator_connector.accelerator - self._strategy = self._accelerator.training_type_plugin + self._strategy = self._accelerator_connector.training_type_plugin + self._accelerator = self._strategy.accelerator self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 From 0f9d245a1cedab5ed72f4febf98a2e59c423610c Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 10 Dec 2021 11:13:56 -0800 Subject: [PATCH 15/42] fix gpu setup environment --- docs/source/extensions/accelerators.rst | 4 ++- pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/cpu.py | 26 +++++++------------ pytorch_lightning/accelerators/gpu.py | 22 +++++++--------- .../training_type/training_type_plugin.py | 2 +- 5 files changed, 24 insertions(+), 32 deletions(-) diff --git a/docs/source/extensions/accelerators.rst b/docs/source/extensions/accelerators.rst index fdc3468b90761..9e7615aec054d 100644 --- a/docs/source/extensions/accelerators.rst +++ b/docs/source/extensions/accelerators.rst @@ -26,7 +26,9 @@ One to handle differences from the training routine and one to handle different from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin accelerator = GPUAccelerator() - trainer = Trainer(accelerator=accelerator) + precision_plugin=NativeMixedPrecisionPlugin(precision=16, device="cuda") + training_type_plugin=DDPPlugin(accelerator=accelerator, 
precision_plugin=precision_plugin) + trainer = Trainer(strategy=training_type_plugin) We expose Accelerators and Plugins mainly for expert users who want to extend Lightning to work with new diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index a7122f4331fa9..093065394b337 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,7 +30,7 @@ class Accelerator: - IPU """ - def setup_environment(self) -> None: + def setup_environment(self, root_device: torch.device) -> None: """Setup any processes or distributed connections. This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 739503efdd898..40c9a3c2b918c 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -15,29 +15,21 @@ import torch -# import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator - -# from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.exceptions import MisconfigurationException class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" - # @property - # def root_device(self): - # return torch.device("cpu") - - # def setup(self, trainer: "pl.Trainer") -> None: - # """ - # Raises: - # MisconfigurationException: - # If the selected device is not CPU. - # """ - # if "cpu" not in str(self.root_device): - # raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") - - # return super().setup(trainer) + def setup_environment(self, root_device: torch.device) -> None: + """ + Raises: + MisconfigurationException: + If the selected device is not CPU. + """ + if "cpu" not in str(root_device): + raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """CPU device stats aren't supported yet.""" diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index a9b2e783d9abc..1c53560d67e7d 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -21,8 +21,7 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator - -# from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 _log = logging.getLogger(__name__) @@ -31,19 +30,18 @@ class GPUAccelerator(Accelerator): """Accelerator for GPU devices.""" - # def setup_environment(self) -> None: - # """ - # Raises: - # MisconfigurationException: - # If the selected device is not GPU. - # """ - # if "cuda" not in str(self.root_device): - # raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") - # torch.cuda.set_device(self.root_device) + def setup_environment(self, root_device: torch.device) -> None: + """ + Raises: + MisconfigurationException: + If the selected device is not GPU. 
+ """ + if "cuda" not in str(root_device): + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + torch.cuda.set_device(root_device) def setup(self, trainer: "pl.Trainer") -> None: self.set_nvidia_flags(trainer.local_rank) - super().setup(trainer) # clear cache before training torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 335a4edb4754e..f1a46d05b0e83 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -93,7 +93,7 @@ def setup_environment(self) -> None: This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator environment before setup is complete. """ - self.accelerator.setup_environment() + self.accelerator.setup_environment(self.root_device) def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Creates optimizers and schedulers. From cc2648a61d2f4ebcb5c6351eb54d0c5149a332e1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 10 Dec 2021 13:18:04 -0800 Subject: [PATCH 16/42] support customized ttp and accelerator --- pytorch_lightning/accelerators/gpu.py | 1 + pytorch_lightning/accelerators/tpu.py | 24 ---------- .../connectors/accelerator_connector.py | 34 +++++++++++-- .../test_accelerator_connector.py | 48 +++++++++++-------- tests/accelerators/test_tpu.py | 21 ++++---- 5 files changed, 68 insertions(+), 60 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 1c53560d67e7d..06ade654fca92 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -41,6 +41,7 @@ def setup_environment(self, root_device: torch.device) -> None: torch.cuda.set_device(root_device) def setup(self, trainer: "pl.Trainer") -> None: + # TODO refactor input from trainer to local_rank @four4fish self.set_nvidia_flags(trainer.local_rank) # clear cache before training torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index e0f633c0015c0..34c37dcd95e7f 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -15,12 +15,7 @@ import torch -# import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator - -# from pytorch_lightning.plugins.precision import TPUPrecisionPlugin -# from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin -# from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.utilities import _XLA_AVAILABLE if _XLA_AVAILABLE: @@ -30,25 +25,6 @@ class TPUAccelerator(Accelerator): """Accelerator for TPU devices.""" - # def setup(self, trainer: "pl.Trainer") -> None: - # """ - # Raises: - # ValueError: - # If the precision or training type plugin are unsupported. - # """ - # if not isinstance(self.training_type_plugin.precision_plugin, TPUPrecisionPlugin): - # # this configuration should have been avoided in the accelerator connector - # raise ValueError( - # f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," - # f" found: {self.training_type_plugin.precision_plugin}." 
- # ) - # if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): - # raise ValueError( - # "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin," - # f" found {self.training_type_plugin}." - # ) - # return super().setup(trainer) - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given TPU device. diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 0e8e798e5eb86..cfe8aa678d3e4 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -178,6 +178,8 @@ def __init__( self.training_type_plugin = self.final_training_type_plugin() self.accelerator = self.training_type_plugin.accelerator + self._check_tpu_mis_config() + # benchmarking # TODO: should this be moved to GPU accelerator? torch.backends.cudnn.benchmark = self.benchmark @@ -405,12 +407,19 @@ def final_training_type_plugin(self) -> TrainingTypePlugin: # attach checkpoint plugin to the training type plugin if self._checkpoint_io is not None: self._training_type_plugin.checkpoint_io = self._checkpoint_io - precision_plugin = self.precision_plugin - if precision_plugin is not None: - self._training_type_plugin._precision_plugin = precision_plugin + if ( + (hasattr(self.strategy, "precision_plugin") and self.precision_plugin is None) + or not hasattr(self.strategy, "precision_plugin") + ): + precision_plugin = self.precision_plugin + if precision_plugin is not None: + self._training_type_plugin._precision_plugin = precision_plugin self._training_type_plugin_resolved = True - - self._training_type_plugin.accelerator = self.select_accelerator() + if ( + (hasattr(self.strategy, "accelerator") and self.strategy.accelerator is None) + or not hasattr(self.strategy, "accelerator") + ): + self._training_type_plugin.accelerator = self.select_accelerator() return self._training_type_plugin @property @@ -1016,3 +1025,18 @@ def _is_slurm_managing_tasks(self) -> bool: total_requested_devices = (self.num_gpus or self.num_processes) * self.num_nodes num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) return num_slurm_tasks == total_requested_devices + + def _check_tpu_mis_config(self) -> None: + # TODO moved from TPUAccelerator when refactor accelerator. Revisit when refactor + # accelerator_connector @four4fish + if isinstance(self.accelerator, TPUAccelerator): + if not isinstance(self.training_type_plugin.precision_plugin, TPUPrecisionPlugin): + raise ValueError( + f"The `TPUAccelerator` can only be used with a `TPUPrecisionPlugin`," + f" found: {self.training_type_plugin.precision_plugin}." + ) + if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): + raise ValueError( + "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin," + f" found {self.training_type_plugin}." 
+ ) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 19e5755e54341..d88484d6fdef4 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -397,7 +397,14 @@ def creates_processes_externally(self) -> bool: @mock.patch.dict( os.environ, - {"SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", "SLURM_LOCALID": "0"}, + { + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @@ -408,28 +415,29 @@ class Accel(Accelerator): class Prec(PrecisionPlugin): pass - class TrainTypePlugin(SingleDevicePlugin): + class TrainTypePlugin(DDPPlugin): pass + ttp = TrainTypePlugin( + device=torch.device("cpu"), + accelerator=Accel(), + precision_plugin=Prec() + ) + trainer = Trainer(strategy=ttp, fast_dev_run=True, num_processes=2) + assert isinstance(trainer.accelerator, Accel) + assert isinstance(trainer.training_type_plugin, TrainTypePlugin) + assert isinstance(trainer.precision_plugin, Prec) + assert trainer._accelerator_connector.training_type_plugin is ttp + + class DistributedPlugin(DDPPlugin): + pass -# ttp = TrainTypePlugin(device=torch.device("cpu")) -# accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) -# trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) -# assert isinstance(trainer.accelerator, Accel) -# assert isinstance(trainer.training_type_plugin, TrainTypePlugin) -# assert isinstance(trainer.precision_plugin, Prec) -# assert trainer._accelerator_connector.training_type_plugin is ttp - -# class DistributedPlugin(DDPPlugin): -# pass - -# ttp = DistributedPlugin() -# accelerator = Accel(training_type_plugin=ttp, precision_plugin=Prec()) -# trainer = Trainer(accelerator=accelerator, fast_dev_run=True, num_processes=2) -# assert isinstance(trainer.accelerator, Accel) -# assert isinstance(trainer.training_type_plugin, DistributedPlugin) -# assert isinstance(trainer.precision_plugin, Prec) -# assert trainer._accelerator_connector.training_type_plugin is ttp + ttp = DistributedPlugin(accelerator=Accel(), precision_plugin=Prec()) + trainer = Trainer(strategy=ttp, fast_dev_run=True, num_processes=2) + assert isinstance(trainer.accelerator, Accel) + assert isinstance(trainer.training_type_plugin, DistributedPlugin) + assert isinstance(trainer.precision_plugin, Prec) + assert trainer._accelerator_connector.training_type_plugin is ttp @mock.patch.dict( diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index bad3036764b1d..65d607fc32ef3 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -288,28 +288,27 @@ def forward(self, x): def test_tpu_invalid_raises(): - # TODO move TPUAccelerator() and CPUAccelerator() setup() misconfig logic into strategies training_type_plugin = TPUSpawnPlugin(accelerator=TPUAccelerator(), precision_plugin=Mock()) - # with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): - # training_type_plugin.setup(Mock()) + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"): + Trainer(strategy=training_type_plugin) training_type_plugin = DDPPlugin(accelerator=TPUAccelerator(), 
precision_plugin=TPUPrecisionPlugin()) - # with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin`): - # training_type_plugin.setup(Mock()) + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin`"): + Trainer(strategy=training_type_plugin) def test_tpu_invalid_raises_set_precision_with_strategy(): accelerator = TPUAccelerator() training_type_plugin = TPUSpawnPlugin(accelerator=accelerator, precision_plugin=object()) - # with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): - # training_type_plugin.setup(object()) + with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"): + Trainer(strategy=training_type_plugin) accelerator = TPUAccelerator() training_type_plugin = DDPPlugin(accelerator=accelerator, precision_plugin=TPUPrecisionPlugin()) - # with pytest.raises( - # ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin" - # ): - # training_type_plugin.setup(object()) + with pytest.raises( + ValueError, match="The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin" + ): + Trainer(strategy=training_type_plugin) @RunIf(tpu=True) From 34b95448d5baef41d5c57f47ec223222407fa69b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Dec 2021 19:16:18 +0000 Subject: [PATCH 17/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/extensions/accelerators.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/extensions/accelerators.rst b/docs/source/extensions/accelerators.rst index 9e7615aec054d..753749d8a3730 100644 --- a/docs/source/extensions/accelerators.rst +++ b/docs/source/extensions/accelerators.rst @@ -26,8 +26,8 @@ One to handle differences from the training routine and one to handle different from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin accelerator = GPUAccelerator() - precision_plugin=NativeMixedPrecisionPlugin(precision=16, device="cuda") - training_type_plugin=DDPPlugin(accelerator=accelerator, precision_plugin=precision_plugin) + precision_plugin = NativeMixedPrecisionPlugin(precision=16, device="cuda") + training_type_plugin = DDPPlugin(accelerator=accelerator, precision_plugin=precision_plugin) trainer = Trainer(strategy=training_type_plugin) From 46283e277a7dbd965ed6c1148c818fefebd6ca98 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 10 Dec 2021 15:29:43 -0800 Subject: [PATCH 18/42] fix tpu error check --- .../accelerators/test_accelerator_connector.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index d88484d6fdef4..2b42c4d62848f 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -418,11 +418,7 @@ class Prec(PrecisionPlugin): class TrainTypePlugin(DDPPlugin): pass - ttp = TrainTypePlugin( - device=torch.device("cpu"), - accelerator=Accel(), - precision_plugin=Prec() - ) + ttp = TrainTypePlugin(device=torch.device("cpu"), accelerator=Accel(), precision_plugin=Prec()) trainer = Trainer(strategy=ttp, fast_dev_run=True, num_processes=2) assert isinstance(trainer.accelerator, Accel) assert isinstance(trainer.training_type_plugin, TrainTypePlugin) 
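
A condensed usage sketch of what these updated tests now exercise: since the TPU checks moved out of `TPUAccelerator.setup` and into the accelerator connector, a mismatched TPU pairing is rejected when the `Trainer` is constructed. Illustrative only, assuming a pytest context:

```python
import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import TPUAccelerator
from pytorch_lightning.plugins import DDPPlugin, TPUPrecisionPlugin

# a TPU accelerator paired with a non-TPU strategy is caught at Trainer construction time
strategy = DDPPlugin(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin())
with pytest.raises(ValueError, match="can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin"):
    Trainer(strategy=strategy)
```
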
@@ -1038,10 +1034,13 @@ def test_unsupported_tpu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='tpu', precision=64\)` is not implemented"): Trainer(accelerator="tpu", precision=64) - with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"): - Trainer(accelerator="tpu", precision=16) - with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but apex AMP is not supported"): - Trainer(accelerator="tpu", precision=16, amp_backend="apex") + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin`"): + with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"): + Trainer(accelerator="tpu", precision=16) + + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUPlugin`"): + with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but apex AMP is not supported"): + Trainer(accelerator="tpu", precision=16, amp_backend="apex") def test_unsupported_ipu_choice(monkeypatch): From e6dfafea04bb27a9abe333e0553c4fe2b4e11a1b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Dec 2021 23:44:19 +0000 Subject: [PATCH 19/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/accelerator_connector.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index cfe8aa678d3e4..bf43a9130e307 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -407,17 +407,15 @@ def final_training_type_plugin(self) -> TrainingTypePlugin: # attach checkpoint plugin to the training type plugin if self._checkpoint_io is not None: self._training_type_plugin.checkpoint_io = self._checkpoint_io - if ( - (hasattr(self.strategy, "precision_plugin") and self.precision_plugin is None) - or not hasattr(self.strategy, "precision_plugin") + if (hasattr(self.strategy, "precision_plugin") and self.precision_plugin is None) or not hasattr( + self.strategy, "precision_plugin" ): precision_plugin = self.precision_plugin if precision_plugin is not None: self._training_type_plugin._precision_plugin = precision_plugin self._training_type_plugin_resolved = True - if ( - (hasattr(self.strategy, "accelerator") and self.strategy.accelerator is None) - or not hasattr(self.strategy, "accelerator") + if (hasattr(self.strategy, "accelerator") and self.strategy.accelerator is None) or not hasattr( + self.strategy, "accelerator" ): self._training_type_plugin.accelerator = self.select_accelerator() return self._training_type_plugin From 347a4f1526e3a646036a6cd6af23cd2c817e0ca1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Fri, 10 Dec 2021 17:10:28 -0800 Subject: [PATCH 20/42] fix precision_plugin initialization to recognisze cusomized plugin --- .../training_type/training_type_plugin.py | 10 ++++++---- .../connectors/accelerator_connector.py | 18 +++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index f1a46d05b0e83..364365baae59b 100644 --- 
a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -51,7 +51,7 @@ def __init__( self._model: Optional[Module] = None checkpoint_io = checkpoint_io if checkpoint_io is not None else TorchCheckpointIO() self._checkpoint_io = checkpoint_io - self._precision_plugin = precision_plugin if precision_plugin is not None else PrecisionPlugin() + self._precision_plugin = precision_plugin self.optimizers: List[Optimizer] = [] self.lr_schedulers: List[_LRScheduler] = [] self.optimizer_frequencies: List[int] = [] @@ -67,8 +67,6 @@ def accelerator(self) -> "pl.Accelerator": @accelerator.setter def accelerator(self, accelerator: "pl.Accelerator") -> None: - # if self._accelerator is not None: - # raise ValueError("Accelerator already set.") self._accelerator = accelerator @property @@ -77,7 +75,11 @@ def checkpoint_io(self) -> CheckpointIO: @property def precision_plugin(self) -> PrecisionPlugin: - return self._precision_plugin + return self._precision_plugin if self._precision_plugin is not None else PrecisionPlugin() + + @precision_plugin.setter + def precision_plugin(self, precision_plugin: Optional[PrecisionPlugin]) -> None: + self._precision_plugin = precision_plugin @checkpoint_io.setter def checkpoint_io(self, plugin: CheckpointIO) -> None: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index bf43a9130e307..6885887e953e3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -174,10 +174,8 @@ def __init__( self._validate_accelerator_type() self._set_devices_if_none() - self._training_type_plugin_resolved = False self.training_type_plugin = self.final_training_type_plugin() self.accelerator = self.training_type_plugin.accelerator - self._check_tpu_mis_config() # benchmarking @@ -398,24 +396,22 @@ def precision_plugin(self) -> PrecisionPlugin: return self._precision_plugin def final_training_type_plugin(self) -> TrainingTypePlugin: - if self._training_type_plugin_resolved: - # avoid calling `resolve_training_type_plugin` multiple times - return self._training_type_plugin if self._training_type_plugin is None: self._training_type_plugin = self.select_training_type_plugin() self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin) # attach checkpoint plugin to the training type plugin if self._checkpoint_io is not None: self._training_type_plugin.checkpoint_io = self._checkpoint_io - if (hasattr(self.strategy, "precision_plugin") and self.precision_plugin is None) or not hasattr( - self.strategy, "precision_plugin" + if ( + (isinstance(self.strategy, TrainingTypePlugin) and self.strategy._precision_plugin is None) + or not isinstance(self.strategy, TrainingTypePlugin) ): precision_plugin = self.precision_plugin if precision_plugin is not None: - self._training_type_plugin._precision_plugin = precision_plugin - self._training_type_plugin_resolved = True - if (hasattr(self.strategy, "accelerator") and self.strategy.accelerator is None) or not hasattr( - self.strategy, "accelerator" + self._training_type_plugin.precision_plugin = precision_plugin + if ( + (isinstance(self.strategy, TrainingTypePlugin) and self.strategy.accelerator is None) + or not isinstance(self.strategy, TrainingTypePlugin) ): self._training_type_plugin.accelerator = self.select_accelerator() return self._training_type_plugin From 
c0120d0fea2dcd0c8de278323a918efd6ba6716c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Dec 2021 01:12:08 +0000 Subject: [PATCH 21/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/accelerator_connector.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6885887e953e3..b4e2bb2a56e61 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -403,15 +403,13 @@ def final_training_type_plugin(self) -> TrainingTypePlugin: if self._checkpoint_io is not None: self._training_type_plugin.checkpoint_io = self._checkpoint_io if ( - (isinstance(self.strategy, TrainingTypePlugin) and self.strategy._precision_plugin is None) - or not isinstance(self.strategy, TrainingTypePlugin) - ): + isinstance(self.strategy, TrainingTypePlugin) and self.strategy._precision_plugin is None + ) or not isinstance(self.strategy, TrainingTypePlugin): precision_plugin = self.precision_plugin if precision_plugin is not None: self._training_type_plugin.precision_plugin = precision_plugin - if ( - (isinstance(self.strategy, TrainingTypePlugin) and self.strategy.accelerator is None) - or not isinstance(self.strategy, TrainingTypePlugin) + if (isinstance(self.strategy, TrainingTypePlugin) and self.strategy.accelerator is None) or not isinstance( + self.strategy, TrainingTypePlugin ): self._training_type_plugin.accelerator = self.select_accelerator() return self._training_type_plugin From f08163b3cd49dcfd8f7e3e3eb45bb97cbf9fa7c0 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 13 Dec 2021 17:57:21 -0800 Subject: [PATCH 22/42] Update bug_report_model.py --- pl_examples/bug_report/bug_report_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pl_examples/bug_report/bug_report_model.py b/pl_examples/bug_report/bug_report_model.py index 44a1f136c13a0..7739630237d32 100644 --- a/pl_examples/bug_report/bug_report_model.py +++ b/pl_examples/bug_report/bug_report_model.py @@ -57,9 +57,6 @@ def run(): num_sanity_val_steps=0, max_epochs=1, enable_model_summary=False, - accelerator="cpu", - strategy="ddp", - devices=2, ) trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data) trainer.test(model, dataloaders=test_data) From 34c13ff2d43f7863605d21bd0970ad41f28a35b5 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 13 Dec 2021 17:58:25 -0800 Subject: [PATCH 23/42] Update accelerator_connector.py --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b4e2bb2a56e61..0e90dda6206ad 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -800,9 +800,6 @@ def select_accelerator(self) -> Accelerator: acc_cls = CPUAccelerator accelerator = acc_cls() - # transfer ownership of the plugins to the accelerator - # self._training_type_plugin = proxy(self.training_type_plugin) - return accelerator def select_cluster_environment(self) -> ClusterEnvironment: 
From 6bdb464ba01b9866e5de66b8bc11ea45eba23f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 05:16:17 +0100 Subject: [PATCH 24/42] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f95ebaebac6c..933f570cf541b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed duplicated file extension when uploading model checkpoints with `NeptuneLogger` ([#11015](https://github.com/PyTorchLightning/pytorch-lightning/pull/11015)) +- Moved ownership of the `Accelerator` instance to the `TrainingTypePlugin`; all training-type plugins now take an optional parameter `accelerator` ([#11022](https://github.com/PyTorchLightning/pytorch-lightning/pull/11022)) + ### Deprecated From c039c682a2404970c514f6d3d86d4cbcc2e6271d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 05:16:35 +0100 Subject: [PATCH 25/42] allow shorthand typing references to pl.Accelerator --- pytorch_lightning/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index c9d914573fe71..6c6ed11063c93 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -13,12 +13,13 @@ _logger.addHandler(logging.StreamHandler()) _logger.propagate = False +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.callbacks import Callback # noqa: E402 from pytorch_lightning.core import LightningDataModule, LightningModule # noqa: E402 from pytorch_lightning.trainer import Trainer # noqa: E402 from pytorch_lightning.utilities.seed import seed_everything # noqa: E402 -__all__ = ["Trainer", "LightningDataModule", "LightningModule", "Callback", "seed_everything"] +__all__ = ["Accelerator", "Trainer", "LightningDataModule", "LightningModule", "Callback", "seed_everything"] # for compatibility with namespace packages __import__("pkg_resources").declare_namespace(__name__) From 0976c503bfd6d28ca1f9e8af461fc29be10b0c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 05:24:46 +0100 Subject: [PATCH 26/42] rename helper method and add docstring --- .../trainer/connectors/accelerator_connector.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 0e90dda6206ad..ac26d5bc29012 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -176,7 +176,7 @@ def __init__( self.training_type_plugin = self.final_training_type_plugin() self.accelerator = self.training_type_plugin.accelerator - self._check_tpu_mis_config() + self._check_plugin_compatibility() # benchmarking # TODO: should this be moved to GPU accelerator? @@ -1015,7 +1015,12 @@ def _is_slurm_managing_tasks(self) -> bool: num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) return num_slurm_tasks == total_requested_devices - def _check_tpu_mis_config(self) -> None: + def _check_plugin_compatibility(self) -> None: + """Checks that selected plugins are compatible with each other. + + Raises: + ValueError: If an invalid combination of Accelerator, TrainingTypePlugin, PrecisionPlugin is found. + """ # TODO moved from TPUAccelerator when refactor accelerator. 
Revisit when refactor # accelerator_connector @four4fish if isinstance(self.accelerator, TPUAccelerator): From 7b1738c42a932971acea54e18de15fa948d13bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 05:34:38 +0100 Subject: [PATCH 27/42] fix typing --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8883d1862b801..bed5c1d4f9938 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -452,7 +452,7 @@ def __init__( plugins, ) self._accelerator_connector.training_type_plugin - self.logger_connector = LoggerConnector(self, log_gpu_memory) + self.logger_connector: LoggerConnector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) self.checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self.signal_connector = SignalConnector(self) From 2f18893f490bc689751b56725c2a77d0bbd90d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 07:58:24 +0100 Subject: [PATCH 28/42] Update pytorch_lightning/trainer/connectors/accelerator_connector.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index ac26d5bc29012..691dbd81e2cb3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -1031,6 +1031,6 @@ def _check_plugin_compatibility(self) -> None: ) if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): raise ValueError( - "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin," + "The `TPUAccelerator` can only be used with a `SingleTPUPlugin` or `TPUSpawnPlugin`," f" found {self.training_type_plugin}." 
) From bf97a58a1686113aee8793ddbeca72496913e0e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 07:58:42 +0100 Subject: [PATCH 29/42] Update tests/accelerators/test_accelerator_connector.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 2b42c4d62848f..394a72734189c 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -403,7 +403,7 @@ def creates_processes_externally(self) -> bool: "SLURM_NODEID": "0", "LOCAL_RANK": "0", "SLURM_PROCID": "0", - "SLURM_LOCALID": "0", + "SLURM_LOCALID": "0" }, ) @mock.patch("torch.cuda.device_count", return_value=0) From e0f4a7775ab71336d553684290301a8d13f1002f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 08:00:01 +0100 Subject: [PATCH 30/42] Update tests/accelerators/test_cpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/accelerators/test_cpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index 566952185252c..1ceaad5d1183b 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -22,7 +22,6 @@ def test_restore_checkpoint_after_pre_dispatch_default(): precision_plugin=PrecisionPlugin() ) assert not plugin.restore_checkpoint_after_pre_dispatch - assert not plugin.restore_checkpoint_after_pre_dispatch @pytest.mark.parametrize("restore_after_pre_dispatch", [True, False]) From 548851981c08b06fc001fac8509bc87ab41e4796 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Dec 2021 07:00:43 +0000 Subject: [PATCH 31/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 394a72734189c..2b42c4d62848f 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -403,7 +403,7 @@ def creates_processes_externally(self) -> bool: "SLURM_NODEID": "0", "LOCAL_RANK": "0", "SLURM_PROCID": "0", - "SLURM_LOCALID": "0" + "SLURM_LOCALID": "0", }, ) @mock.patch("torch.cuda.device_count", return_value=0) From b69537e7b8c6d7e41b1dbea025a42cddef849915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 16:40:32 +0100 Subject: [PATCH 32/42] fix pre commit complaint --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 6c6ed11063c93..8114e318012ea 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -13,7 +13,7 @@ _logger.addHandler(logging.StreamHandler()) _logger.propagate = False -from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning.accelerators import Accelerator # noqa: E402 from pytorch_lightning.callbacks import Callback # noqa: E402 from pytorch_lightning.core import 
LightningDataModule, LightningModule # noqa: E402 from pytorch_lightning.trainer import Trainer # noqa: E402 From 94fe8f84dd066d51146aad111391f83dace2bdc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 18:15:36 +0100 Subject: [PATCH 33/42] update typing to long ugly path --- pytorch_lightning/__init__.py | 3 +-- pytorch_lightning/plugins/training_type/ddp.py | 2 +- pytorch_lightning/plugins/training_type/ddp_spawn.py | 2 +- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- pytorch_lightning/plugins/training_type/dp.py | 2 +- pytorch_lightning/plugins/training_type/fully_sharded.py | 2 +- pytorch_lightning/plugins/training_type/horovod.py | 2 +- pytorch_lightning/plugins/training_type/ipu.py | 2 +- pytorch_lightning/plugins/training_type/parallel.py | 2 +- pytorch_lightning/plugins/training_type/single_device.py | 2 +- pytorch_lightning/plugins/training_type/single_tpu.py | 2 +- pytorch_lightning/plugins/training_type/tpu_spawn.py | 2 +- .../plugins/training_type/training_type_plugin.py | 6 +++--- 13 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 8114e318012ea..c9d914573fe71 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -13,13 +13,12 @@ _logger.addHandler(logging.StreamHandler()) _logger.propagate = False -from pytorch_lightning.accelerators import Accelerator # noqa: E402 from pytorch_lightning.callbacks import Callback # noqa: E402 from pytorch_lightning.core import LightningDataModule, LightningModule # noqa: E402 from pytorch_lightning.trainer import Trainer # noqa: E402 from pytorch_lightning.utilities.seed import seed_everything # noqa: E402 -__all__ = ["Accelerator", "Trainer", "LightningDataModule", "LightningModule", "Callback", "seed_everything"] +__all__ = ["Trainer", "LightningDataModule", "LightningModule", "Callback", "seed_everything"] # for compatibility with namespace packages __import__("pkg_resources").declare_namespace(__name__) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index bc1db13d1490f..46c6f4d0ac1bb 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -84,7 +84,7 @@ class DDPPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index d433d5a93c0cc..76b0db3e1370d 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -62,7 +62,7 @@ class DDPSpawnPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 6aee74f38b004..88fa34905ba1e 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ 
b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -88,7 +88,7 @@ class DeepSpeedPlugin(DDPPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, zero_optimization: bool = True, stage: int = 2, remote_device: str = "cpu", diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 71ee0fc7b9d86..decadb3f0ce5d 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -35,7 +35,7 @@ class DataParallelPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, diff --git a/pytorch_lightning/plugins/training_type/fully_sharded.py b/pytorch_lightning/plugins/training_type/fully_sharded.py index 2824fa3e03d33..475701e13f593 100644 --- a/pytorch_lightning/plugins/training_type/fully_sharded.py +++ b/pytorch_lightning/plugins/training_type/fully_sharded.py @@ -37,7 +37,7 @@ class DDPFullyShardedPlugin(DDPPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, cpu_offload: bool = False, flatten_parameters: bool = True, reshard_after_forward: bool = True, diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 67ef3b492feb4..858d290b20d5b 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -41,7 +41,7 @@ class HorovodPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index a9033045c22c0..9a1ddaf9b38d1 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -62,7 +62,7 @@ class IPUPlugin(ParallelPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, device_iterations: int = 1, autoreport: bool = False, autoreport_dir: Optional[str] = None, diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 8eff90289e199..9e1967fc64409 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -34,7 +34,7 @@ class ParallelPlugin(TrainingTypePlugin, ABC): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 288a47b2dde77..ca95330281cb0 100644 --- 
a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -28,7 +28,7 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__( self, device: torch.device, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ): diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index aa6c508f2bc7b..34bb0b01f4ffd 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -34,7 +34,7 @@ class SingleTPUPlugin(SingleDevicePlugin): def __init__( self, device: int, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, debug: bool = False, diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index f4215e08663c9..f6c85b060e0a0 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -54,7 +54,7 @@ class TPUSpawnPlugin(DDPSpawnPlugin): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, parallel_devices: Optional[List[int]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 364365baae59b..b0af138a460ff 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -43,7 +43,7 @@ class TrainingTypePlugin(ABC): def __init__( self, - accelerator: Optional["pl.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ) -> None: @@ -62,11 +62,11 @@ def __init__( ) @property - def accelerator(self) -> "pl.Accelerator": + def accelerator(self) -> "pl.accelerators.accelerator.Accelerator": return self._accelerator @accelerator.setter - def accelerator(self, accelerator: "pl.Accelerator") -> None: + def accelerator(self, accelerator: "pl.accelerators.accelerator.Accelerator") -> None: self._accelerator = accelerator @property From 19bcf3f97b98b1d816f114ed5db8149ab12da959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 18:18:41 +0100 Subject: [PATCH 34/42] spacing in flow diagram --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bed5c1d4f9938..0d0ec111d6adb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1125,7 +1125,7 @@ def _run( {Trainer.fit} or {Trainer.test} or {Trainer.predict} || | || spawn processes || - {self.training_type_plugin.setup_environment} || + {self.training_type_plugin.setup_environment} || | || setup accelerator || and strategy || LIGHTNING From 
5862afe371588605be20936c8d3c905b1af35385 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Wed, 15 Dec 2021 09:22:09 -0800 Subject: [PATCH 35/42] remove todo comments --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 691dbd81e2cb3..cd84351e60b56 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -1021,8 +1021,6 @@ def _check_plugin_compatibility(self) -> None: Raises: ValueError: If an invalid combination of Accelerator, TrainingTypePlugin, PrecisionPlugin is found. """ - # TODO moved from TPUAccelerator when refactor accelerator. Revisit when refactor - # accelerator_connector @four4fish if isinstance(self.accelerator, TPUAccelerator): if not isinstance(self.training_type_plugin.precision_plugin, TPUPrecisionPlugin): raise ValueError( From 2dfc4432c34006d2453fdb8408d3fe7f21c486e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 18:22:52 +0100 Subject: [PATCH 36/42] docformatter --- .../trainer/connectors/accelerator_connector.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index cd84351e60b56..0154bc94acac1 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -996,13 +996,14 @@ def _set_distrib_type_if_training_type_plugin_passed(self): self._distrib_type = getattr(self._training_type_plugin, "distributed_backend", None) def _is_slurm_managing_tasks(self) -> bool: - """Returns whether we let SLURM manage the processes or not. Returns ``True`` if and only if these - conditions match: + """Returns whether we let SLURM manage the processes or not. 
- - A SLURM cluster is detected - - A distributed plugin is being used - - The process is not launching in interactive mode - - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer + Returns ``True`` if and only if these conditions match: + + - A SLURM cluster is detected + - A distributed plugin is being used + - The process is not launching in interactive mode + - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer """ if ( (not self.use_ddp and not self.use_ddp2) From 92cc26223e0f0cad95a267e17cdd94b4e7343186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 18:26:41 +0100 Subject: [PATCH 37/42] Update pytorch_lightning/plugins/training_type/training_type_plugin.py --- pytorch_lightning/plugins/training_type/training_type_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index b0af138a460ff..bfdb01577aafe 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -43,7 +43,7 @@ class TrainingTypePlugin(ABC): def __init__( self, - accelerator: Optional["pl.accelerators.accelerator.Accelerators.accelerator.Accelerator"] = None, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ) -> None: From ff3e2dce377507dd7cd0af8ec3e9ecdfbcb99e27 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Wed, 15 Dec 2021 10:09:31 -0800 Subject: [PATCH 38/42] revert test changes --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 2b42c4d62848f..0ef10b4eb2a9f 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -415,7 +415,7 @@ class Accel(Accelerator): class Prec(PrecisionPlugin): pass - class TrainTypePlugin(DDPPlugin): + class TrainTypePlugin(SingleDevicePlugin): pass ttp = TrainTypePlugin(device=torch.device("cpu"), accelerator=Accel(), precision_plugin=Prec()) From 9f1eadeb7a787ca3de4048fb6e0cdf09db3504e1 Mon Sep 17 00:00:00 2001 From: Siyu Wang Date: Wed, 15 Dec 2021 11:24:25 -0800 Subject: [PATCH 39/42] improve custom plugin examples --- docs/source/extensions/plugins.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 0c9a6bbbbbc1c..f791df894d0c8 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -81,7 +81,9 @@ can then be passed into the Trainer directly or via a (custom) accelerator: # fully custom accelerator and plugins accelerator = MyAccelerator() - trainer = Trainer(accelerator=accelerator) + precision_plugin = MyPrecisionPlugin() + training_type_plugin = CustomDDPPlugin(accelerator=accelerator, precision_plugin=precision_plugin) + trainer = Trainer(strategy=training_type_plugin) The full list of built-in plugins is listed below. 
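The updated plugins.rst snippet above references MyAccelerator, MyPrecisionPlugin and CustomDDPPlugin without defining them. A minimal sketch of those placeholder classes, assuming the branch state at this point in the series and mirroring the GPUAccelerator / NativeMixedPrecisionPlugin / DDPPlugin example shown in accelerators.rst earlier in the series, might look like this (the subclass bodies are illustrative assumptions only):

# Sketch only: MyAccelerator, MyPrecisionPlugin and CustomDDPPlugin are the
# placeholder names from the docs snippet above; their bodies are assumptions.
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import GPUAccelerator
from pytorch_lightning.plugins import DDPPlugin, NativeMixedPrecisionPlugin


class MyAccelerator(GPUAccelerator):
    """A custom accelerator that reuses the GPU defaults unchanged."""


class MyPrecisionPlugin(NativeMixedPrecisionPlugin):
    """A custom precision plugin based on native AMP with 16-bit precision."""

    def __init__(self):
        super().__init__(precision=16, device="cuda")


class CustomDDPPlugin(DDPPlugin):
    """A custom training type plugin; after this series it owns the accelerator and precision plugin."""


accelerator = MyAccelerator()
precision_plugin = MyPrecisionPlugin()
training_type_plugin = CustomDDPPlugin(accelerator=accelerator, precision_plugin=precision_plugin)
trainer = Trainer(strategy=training_type_plugin)

Passing the fully assembled training type plugin through `strategy` is the pattern the series converges on: the Trainer no longer wires the accelerator and precision plugin together itself.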
From a74f4c1b57492fd15dac0378801832e1932b019e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 16 Dec 2021 02:08:59 +0100 Subject: [PATCH 40/42] remove redundant call to ttp attribute it is no longer a property --- pytorch_lightning/trainer/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0d0ec111d6adb..ed45c8b3797d2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -451,7 +451,6 @@ def __init__( amp_level, plugins, ) - self._accelerator_connector.training_type_plugin self.logger_connector: LoggerConnector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) self.checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) From 292c6402c3e208123e2cfc9b49f48b1e5d832112 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Dec 2021 01:44:27 +0000 Subject: [PATCH 41/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_cpu.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index 1ceaad5d1183b..2ef234b1ffde6 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -17,9 +17,7 @@ def test_restore_checkpoint_after_pre_dispatch_default(): """Assert default for restore_checkpoint_after_pre_dispatch is False.""" plugin = SingleDevicePlugin( - accelerator=CPUAccelerator(), - device=torch.device("cpu"), - precision_plugin=PrecisionPlugin() + accelerator=CPUAccelerator(), device=torch.device("cpu"), precision_plugin=PrecisionPlugin() ) assert not plugin.restore_checkpoint_after_pre_dispatch @@ -55,7 +53,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: accelerator=CPUAccelerator(), precision_plugin=PrecisionPlugin(), device=torch.device("cpu"), - checkpoint_io=TorchCheckpointIO() + checkpoint_io=TorchCheckpointIO(), ) assert plugin.restore_checkpoint_after_pre_dispatch == restore_after_pre_dispatch From 448b5244a89cc2324be6bf08e059b34d1f59d1aa Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Wed, 15 Dec 2021 18:38:21 -0800 Subject: [PATCH 42/42] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ed45c8b3797d2..b0ef15e33a46e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -451,7 +451,7 @@ def __init__( amp_level, plugins, ) - self.logger_connector: LoggerConnector = LoggerConnector(self, log_gpu_memory) + self.logger_connector = LoggerConnector(self, log_gpu_memory) self._callback_connector = CallbackConnector(self) self.checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self.signal_connector = SignalConnector(self)
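Taken together, the series leaves the TrainingTypePlugin as the owner of both the Accelerator and the PrecisionPlugin. A minimal sketch of the resulting behaviour, assuming the branch state after the final patch and reusing only constructors and assertions that appear in the touched tests (SingleDevicePlugin, CPUAccelerator, PrecisionPlugin), might be:

import torch

from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import CPUAccelerator
from pytorch_lightning.plugins import PrecisionPlugin, SingleDevicePlugin

# The training type plugin now takes the accelerator as an optional constructor argument.
plugin = SingleDevicePlugin(device=torch.device("cpu"), accelerator=CPUAccelerator())

assert isinstance(plugin.accelerator, CPUAccelerator)
# No precision plugin was passed, so the property falls back to a default PrecisionPlugin.
assert isinstance(plugin.precision_plugin, PrecisionPlugin)

# The fully configured plugin is handed to the Trainer via `strategy`.
trainer = Trainer(strategy=plugin, fast_dev_run=True)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.training_type_plugin, SingleDevicePlugin)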