From 6ef051c09b05ba22722cfe7b1a0fae9f01618940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 19 Jan 2023 18:44:16 +0100 Subject: [PATCH 1/2] Remove the FairScale integration --- docs/source-pytorch/api_references.rst | 5 - docs/source-pytorch/conf.py | 1 - docs/source-pytorch/extensions/plugins.rst | 2 - docs/source-pytorch/guides/speed.rst | 2 +- .../pytorch/check-avail-strategies.py | 1 - requirements/pytorch/strategies.txt | 1 - .../components/multi_node/trainer.py | 1 - src/pytorch_lightning/CHANGELOG.md | 8 + .../callbacks/stochastic_weight_avg.py | 4 +- src/pytorch_lightning/overrides/fairscale.py | 42 --- src/pytorch_lightning/plugins/__init__.py | 4 - .../plugins/precision/__init__.py | 4 - .../precision/fully_sharded_native_amp.py | 42 --- .../plugins/precision/sharded_native_amp.py | 53 --- .../serve/servable_module_validator.py | 3 +- src/pytorch_lightning/strategies/__init__.py | 3 - src/pytorch_lightning/strategies/ddp.py | 6 - .../strategies/fully_sharded.py | 313 --------------- src/pytorch_lightning/strategies/sharded.py | 146 ------- .../strategies/sharded_spawn.py | 121 ------ .../connectors/accelerator_connector.py | 12 - .../components/multi_node/test_trainer.py | 3 - tests/tests_pytorch/helpers/runif.py | 11 - .../precision/test_sharded_precision.py | 43 --- .../plugins/test_cluster_integration.py | 27 +- ..._ddp_fully_sharded_with_full_state_dict.py | 253 ------------- .../tests_pytorch/strategies/test_registry.py | 35 +- .../strategies/test_sharded_strategy.py | 356 ------------------ .../connectors/test_accelerator_connector.py | 51 +-- tests/tests_pytorch/trainer/test_trainer.py | 67 +--- tests/tests_pytorch/utilities/test_imports.py | 9 +- 31 files changed, 44 insertions(+), 1585 deletions(-) delete mode 100644 src/pytorch_lightning/overrides/fairscale.py delete mode 100644 src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py delete mode 100644 src/pytorch_lightning/plugins/precision/sharded_native_amp.py delete mode 100644 src/pytorch_lightning/strategies/fully_sharded.py delete mode 100644 src/pytorch_lightning/strategies/sharded.py delete mode 100644 src/pytorch_lightning/strategies/sharded_spawn.py delete mode 100644 tests/tests_pytorch/plugins/precision/test_sharded_precision.py delete mode 100644 tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py delete mode 100644 tests/tests_pytorch/strategies/test_sharded_strategy.py diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index c722af960212b..f7e9efb18cd45 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -176,13 +176,11 @@ precision ColossalAIPrecisionPlugin DeepSpeedPrecisionPlugin DoublePrecisionPlugin - FullyShardedNativeMixedPrecisionPlugin FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin PrecisionPlugin - ShardedNativeMixedPrecisionPlugin TPUBf16PrecisionPlugin TPUPrecisionPlugin @@ -276,9 +274,6 @@ strategies BaguaStrategy ColossalAIStrategy DDPFullyShardedNativeStrategy - DDPFullyShardedStrategy - DDPShardedStrategy - DDPSpawnShardedStrategy DDPSpawnStrategy DDPStrategy DataParallelStrategy diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 80a055f3c9bef..bfb9e80e0c44f 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -294,7 +294,6 @@ def _transform_changelog(path_in: str, path_out: str) -> None: "numpy": 
("https://numpy.org/doc/stable/", None), "PIL": ("https://pillow.readthedocs.io/en/stable/", None), "torchmetrics": ("https://torchmetrics.readthedocs.io/en/stable/", None), - "fairscale": ("https://fairscale.readthedocs.io/en/latest/", None), "graphcore": ("https://docs.graphcore.ai/en/latest/", None), } diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index 560c26a3e1cda..b9f21a8ad1610 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -55,13 +55,11 @@ The full list of built-in precision plugins is listed below. ColossalAIPrecisionPlugin DeepSpeedPrecisionPlugin DoublePrecisionPlugin - FullyShardedNativeMixedPrecisionPlugin FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin PrecisionPlugin - ShardedNativeMixedPrecisionPlugin TPUBf16PrecisionPlugin TPUPrecisionPlugin diff --git a/docs/source-pytorch/guides/speed.rst b/docs/source-pytorch/guides/speed.rst index 10b957729904c..b95309c9a8a44 100644 --- a/docs/source-pytorch/guides/speed.rst +++ b/docs/source-pytorch/guides/speed.rst @@ -28,7 +28,7 @@ GPU Training Lightning supports a variety of plugins to speed up distributed GPU training. Most notably: * :class:`~pytorch_lightning.strategies.DDPStrategy` -* :class:`~pytorch_lightning.strategies.DDPShardedStrategy` +* :class:`~pytorch_lightning.strategies.DDPFullyShardedNativeStrategy` * :class:`~pytorch_lightning.strategies.DeepSpeedStrategy` .. code-block:: python diff --git a/requirements/pytorch/check-avail-strategies.py b/requirements/pytorch/check-avail-strategies.py index eb3b66b989401..94bb9b924b769 100644 --- a/requirements/pytorch/check-avail-strategies.py +++ b/requirements/pytorch/check-avail-strategies.py @@ -1,4 +1,3 @@ if __name__ == "__main__": import bagua # noqa: F401 import deepspeed # noqa: F401 - import fairscale # noqa: F401 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4de4dc15f51b0..4010828b7df13 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -2,5 +2,4 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment # colossalai>=0.1.10 # TODO: uncomment when there's a stable version released -fairscale>=0.4.5, <0.4.13 deepspeed>=0.6.0, <=0.7.0 diff --git a/src/lightning_app/components/multi_node/trainer.py b/src/lightning_app/components/multi_node/trainer.py index e3f738abad329..e3da755bfcb40 100644 --- a/src/lightning_app/components/multi_node/trainer.py +++ b/src/lightning_app/components/multi_node/trainer.py @@ -40,7 +40,6 @@ def run( try: pkg = importlib.import_module(pkg_name) trainers.append(pkg.Trainer) - strategies.append(pkg.strategies.DDPSpawnShardedStrategy) strategies.append(pkg.strategies.DDPSpawnStrategy) mps_accelerators.append(pkg.accelerators.MPSAccelerator) except (ImportError, ModuleNotFoundError): diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5455ce9099869..2a2f15ec8ac2e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -44,6 +44,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed `Trainer(strategy='horovod')` support ([#16150](https://github.com/Lightning-AI/lightning/pull/16150)) +- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#TODO](https://github.com/PyTorchLightning/pytorch-lightning/pull/TODO)) + * Removed the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class + * Removed the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class + * Removed the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class + * Removed the `pytorch_lightning.strategies.fully_sharded.DDPFullyShardedStrategy` (fsdp) class + * Removed the `pytorch_lightning.strategies.sharded.DDPShardedStrategy` (ddp_sharded) class + * Removed the `pytorch_lightning.strategies.sharded_spawn.DDPSpawnShardedStrategy` (ddp_sharded_spawn) class + - Removed legacy device arguments in Trainer ([#16171](https://github.com/Lightning-AI/lightning/pull/16171)) * Removed the `Trainer(gpus=...)` argument * Removed the `Trainer(tpu_cores=...)` argument diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index c7705775bc267..1a347cd202abc 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -25,7 +25,7 @@ import pytorch_lightning as pl from lightning_fabric.utilities.types import LRScheduler from pytorch_lightning.callbacks.callback import Callback -from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn @@ -146,7 +146,7 @@ def pl_module_contains_batch_norm(pl_module: "pl.LightningModule") -> bool: return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules()) def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: - if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy, DeepSpeedStrategy)): + if isinstance(trainer.strategy, (DDPFullyShardedNativeStrategy, DeepSpeedStrategy)): raise MisconfigurationException("SWA does not currently support sharded models.") # copy the model before moving it to accelerator device. diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py deleted file mode 100644 index 93b100f9e3135..0000000000000 --- a/src/pytorch_lightning/overrides/fairscale.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import List - -from lightning_utilities.core.imports import package_available -from torch.optim import Optimizer - -from lightning_fabric.plugins import Precision -from lightning_fabric.utilities.imports import _IS_WINDOWS - -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and package_available("fairscale") - -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS -else: - OSS = object - - -def _reinit_optimizers_with_oss(optimizers: List[Optimizer], precision: Precision, num_nodes: int) -> List["OSS"]: - for x, optimizer in enumerate(optimizers): - if not isinstance(optimizer, OSS): - optim_class = type(optimizer) - zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) - is_fp16 = precision.precision == "16" - # For multi-node training, compressing the model shards in fp16 before broadcasting - # improves performance. When using PyTorch AMP, it will not degrade - # the model performance. - zero_optimizer.broadcast_fp16 = is_fp16 and num_nodes > 1 - optimizers[x] = zero_optimizer - del optimizer - return optimizers diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index a9c393a0dd07e..769b0d9199214 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -8,12 +8,10 @@ from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin @@ -33,8 +31,6 @@ "HPUPrecisionPlugin", "MixedPrecisionPlugin", "PrecisionPlugin", - "ShardedNativeMixedPrecisionPlugin", - "FullyShardedNativeMixedPrecisionPlugin", "FullyShardedNativeNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", diff --git a/src/pytorch_lightning/plugins/precision/__init__.py b/src/pytorch_lightning/plugins/precision/__init__.py index d200d1c2f3fb8..85e8c7586c89f 100644 --- a/src/pytorch_lightning/plugins/precision/__init__.py +++ b/src/pytorch_lightning/plugins/precision/__init__.py @@ -15,12 +15,10 @@ from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from 
pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin @@ -29,12 +27,10 @@ "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", "FullyShardedNativeNativeMixedPrecisionPlugin", - "FullyShardedNativeMixedPrecisionPlugin", "HPUPrecisionPlugin", "IPUPrecisionPlugin", "MixedPrecisionPlugin", "PrecisionPlugin", - "ShardedNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", ] diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py deleted file mode 100644 index 904d61f4dffc3..0000000000000 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Any - -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): - """Native AMP for Fully Sharded Training.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: - # see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html - # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect - # for FSDP module. To overcome this, needs to call sharded_module.clip_grad_norm(clip_val) - # however we rely on LightningModule's configure_sharded_model to wrap FSDP, it would be hard to - # trace back the root FSDP. Now we only support clip by value. - raise MisconfigurationException( - f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" - ) diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py deleted file mode 100644 index f4f646b4239a2..0000000000000 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional, Union - -from typing_extensions import Literal - -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS - from fairscale.optim.grad_scaler import ShardedGradScaler -else: - OSS = ShardedGradScaler = object - - -class ShardedNativeMixedPrecisionPlugin(MixedPrecisionPlugin): - """Native AMP for Sharded Training.""" - - def __init__( - self, precision: Literal["16", 16, "bf16"], device: str, scaler: Optional[ShardedGradScaler] = None - ) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - if not _FAIRSCALE_AVAILABLE: - raise MisconfigurationException( - "You have asked for sharded AMP but you have not installed it." 
- " Install `fairscale` using this guide: https://https://github.com/facebookresearch/fairscale" - ) - super().__init__( - precision, device, scaler=(ShardedGradScaler() if scaler is None and str(precision) == "16" else None) - ) - - def clip_grad_by_norm(self, optimizer: "OSS", clip_val: Union[int, float]) -> None: - optimizer.clip_grad_norm(clip_val) diff --git a/src/pytorch_lightning/serve/servable_module_validator.py b/src/pytorch_lightning/serve/servable_module_validator.py index c3aed93daa570..f654c8a8ab32a 100644 --- a/src/pytorch_lightning/serve/servable_module_validator.py +++ b/src/pytorch_lightning/serve/servable_module_validator.py @@ -11,7 +11,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks import Callback from pytorch_lightning.serve.servable_module import ServableModule -from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy, DDPFullyShardedStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_only @@ -19,7 +19,6 @@ _NOT_SUPPORTED_STRATEGIES = ( DeepSpeedStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, ) _logger = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index dcfb11eecb3a2..2807fafbeff6d 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -18,13 +18,10 @@ from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 -from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 -from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 -from pytorch_lightning.strategies.sharded_spawn import DDPSpawnShardedStrategy # noqa: F401 from pytorch_lightning.strategies.single_device import SingleDeviceStrategy # noqa: F401 from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.single_tpu import SingleTPUStrategy # noqa: F401 diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 5c5db3bcfbd5b..52209ee72d97f 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -39,7 +39,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import 
ParallelStrategy @@ -49,10 +48,6 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS -else: - OSS = object if torch.distributed.is_available(): from torch.distributed.algorithms.model_averaging.averagers import ModelAverager @@ -230,7 +225,6 @@ def _enable_model_averaging(self) -> None: if ( is_distributed_optimizer or isinstance(optimizer, ZeroRedundancyOptimizer) - or (_FAIRSCALE_AVAILABLE and isinstance(optimizer, OSS)) or isinstance(optimizer, PostLocalSGDOptimizer) ): raise ValueError( diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py deleted file mode 100644 index 534fdf8dbbe32..0000000000000 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import logging -from typing import Any, Dict, Generator, List, Optional - -import torch -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation -from pytorch_lightning.utilities.types import STEP_OUTPUT - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn import default_auto_wrap_policy, enable_wrap - from fairscale.nn.data_parallel import FullyShardedDataParallel -else: - FullyShardedDataParallel = None - -log = logging.getLogger(__name__) - - -class _DDPFullyShardedStrategyModuleWrapper(_LightningModuleWrapperBase): - def state_dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # type: ignore[override] - # this is required because with FSDP lightning_module is empty because weights are sharded. - # So we need to call self.trainer.model.state_dict (wrapped version) and use this wraper to - # avoid extra keys `_forward_module.layer.weight.` since we want `layer.weight.` in state_dict. 
- return self._forward_module.state_dict(*args, **kwargs) - - -class DDPFullyShardedStrategy(DDPStrategy): - - strategy_name = "ddp_fully_sharded" - - def __init__( - self, - accelerator: Optional["pl.accelerators.Accelerator"] = None, - cpu_offload: bool = False, - flatten_parameters: bool = True, - reshard_after_forward: bool = True, - move_grads_to_cpu: Optional[bool] = None, - fp32_reduce_scatter: Optional[bool] = None, - compute_dtype: Optional[torch.dtype] = None, - bucket_cap_mb: int = 25, - min_num_params: int = 100_000_000, - state_dict_to_cpu: bool = True, - parallel_devices: Optional[List[torch.device]] = None, - cluster_environment: Optional[ClusterEnvironment] = None, - checkpoint_io: Optional[CheckpointIO] = None, - precision_plugin: Optional[PrecisionPlugin] = None, - process_group_backend: Optional[str] = None, - ): - """Plugin for Fully Sharded Data Parallel provided by FairScale. - - .. warning:: ``DDPFullyShardedStrategy`` is in beta and subject to change. - - Full Sharded Training shards the entire model across all available GPUs, allowing you to scale model - size, whilst using efficient communication to reduce overhead. In practice, this means we can remain - at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar - to ZeRO-Stage 3 but has been built for upstreaming to PyTorch. - - For more information - `check out FairScale's docs `__. - - Defaults have been set and options have been exposed, but may require configuration - based on your level of memory/speed efficiency. We suggest having a look at - `this PR for more information `__. - - Many of the helpful doc strings below came from the original - `FairScale documentation `__. - - Arguments: - cpu_offload: Offload FP32 params to CPU. Only usable in precision=16 mode. - (Default: False). - move_grads_to_cpu: Moves gradient shards to CPU after reduction. - Only disable if using CPU based optimizers - (Default to ``cpu_offload``). - flatten_parameters: Flattens parameter into single contiguous tensor for speed efficiency - (Default: True). - reshard_after_forward: Reshard parameters after the forward pass, which saves memory but slows - down training. This is only relevant when resharding individual layers. - (Default: True). - fp32_reduce_scatter: Reduce-Scatter gradients in FP32. Only relevant in mixed precision - (Default: None). - compute_dtype: dtype for full parameters for computation. Default to torch.float32, - unless using mixed precision, in which case defaults to torch.float16. - (Default: None). - bucket_cap_mb: bucket parameters so that gradient reduction - can potentially overlap with backward computation. - bucket_cap_mb controls the bucket size in MegaBytes (MB). - Buckets are sub-divided based on world_size, - so the max shard size is roughly bucket_cap_mb / world_size. - Values <= 0 disable bucketing. - (Default: 25). - min_num_params: Number of parameters to wrap when using FairScale ``auto_wrap``. - (Default: 1e8) - state_dict_to_cpu: Whether to return parameters (returned by :func:`state_dict`) on CPU device. - If ``False``, this will default to ``compute_device``. - (Default: True). - """ - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." 
- " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__( - accelerator=accelerator, - parallel_devices=parallel_devices, - cluster_environment=cluster_environment, - checkpoint_io=checkpoint_io, - precision_plugin=precision_plugin, - process_group_backend=process_group_backend, - ) - self.cpu_offload = cpu_offload - self.move_grads_to_cpu = move_grads_to_cpu - self.flatten_parameters = flatten_parameters - self.reshard_after_forward = reshard_after_forward - self.fp32_reduce_scatter = fp32_reduce_scatter - self.compute_dtype = compute_dtype - self.bucket_cap_mb = bucket_cap_mb - self.min_num_params = min_num_params - self.state_dict_device = torch.device("cpu") if state_dict_to_cpu else None - self._process_group = None - - @property - def process_group(self) -> Any: - if self._process_group is None: - self._process_group = torch.distributed.new_group() - return self._process_group - - def lightning_module_state_dict(self) -> Dict[str, Any]: - """Returns model state.""" - assert self.model is not None - return self.model.state_dict() - - def connect(self, model: "pl.LightningModule") -> None: - """Called by the accelerator to connect the accelerator and the model with this plugin.""" - # TODO: Wait for this issue to resolve and remove this blocker - # https://github.com/facebookresearch/fairscale/issues/648 - # Also make sure to update the tests - if not is_overridden("configure_sharded_model", self.lightning_module) and len(list(model.parameters())) == 0: - assert self.lightning_module is not None - raise MisconfigurationException( - f"Using the same instance of model with `trainer.{self.lightning_module.trainer.state.fn}()` is not" - " supported with Fairscale FSDP auto-wrap. Please reinitialize your `LightningModule` and pass that." - ) - - super().connect(model) - - def setup_distributed(self) -> None: - if not self.root_device.type == "cuda": - raise MisconfigurationException( - "You selected strategy to be `ddp_fully_sharded`, but GPU is not available." - ) - super().setup_distributed() - - def setup(self, trainer: "pl.Trainer") -> None: - assert self.accelerator - self.accelerator.setup(trainer) - - if trainer.state.fn == TrainerFn.FITTING: - if self._layer_sync: - assert self.model - self.model = self._layer_sync.apply(self.model) - - self.configure_ddp() - assert isinstance(self.model, pl.LightningModule) - self.model = _DDPFullyShardedStrategyModuleWrapper(self.model) - assert self.lightning_module is not None - if not is_overridden("configure_sharded_model", self.lightning_module): - self.model = self._setup_model(self.model) - self.setup_optimizers(self.lightning_module.trainer) - _optimizers_to_device(self.optimizers, self.root_device) - self.barrier() - - self.setup_precision_plugin() - - def setup_optimizers(self, trainer: "pl.Trainer") -> None: - invalid_params_error = False - try: - super().setup_optimizers(trainer) - except ValueError as e: - if "optimizer got an empty parameter list" not in str(e): - raise - invalid_params_error = True - - if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. 
HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) - - def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: - """Wraps the model into a - :class:`~fairscale.nn.data_parallel.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" - log.detail(f"setting up `Fairscale FSDP` model with device id: {self.root_device.index}.") - - return FullyShardedDataParallel( - module=model, - process_group=self.process_group, - cpu_offload=self.cpu_offload, - move_grads_to_cpu=self.move_grads_to_cpu, - flatten_parameters=self.flatten_parameters, - mixed_precision=(self.precision_plugin.precision == "16"), - reshard_after_forward=self.reshard_after_forward, - fp32_reduce_scatter=self.fp32_reduce_scatter, - compute_dtype=self.compute_dtype, - bucket_cap_mb=self.bucket_cap_mb, - state_dict_device=self.state_dict_device, - ) - - @contextlib.contextmanager - def model_sharded_context(self) -> Generator: - log.detail(f"{self.__class__.__name__}: entered model_sharded_context.") - precision = self.precision_plugin.precision - - def wrap_policy(*args: Any, **kwargs: Any) -> Any: - return default_auto_wrap_policy(*args, **kwargs, min_num_params=self.min_num_params) - - with enable_wrap( - wrapper_cls=FullyShardedDataParallel, - auto_wrap_policy=wrap_policy, - process_group=self.process_group, - cpu_offload=self.cpu_offload, - move_grads_to_cpu=self.move_grads_to_cpu, - flatten_parameters=self.flatten_parameters, - mixed_precision=(precision == "16"), - reshard_after_forward=self.reshard_after_forward, - fp32_reduce_scatter=self.fp32_reduce_scatter, - compute_dtype=self.compute_dtype, - bucket_cap_mb=self.bucket_cap_mb, - state_dict_device=self.state_dict_device, - ): - yield - - log.detail(f"{self.__class__.__name__}: exiting model_sharded_context.") - - def configure_ddp(self) -> None: - log.detail(f"{self.__class__.__name__}: configuring FSDP... (cpu_offload: [{self.cpu_offload}])") - if not self.cpu_offload: - # When using CPU Offload, FSDP will manage the CUDA movement for us. 
- # Note: this would be problematic for large model (which could not fit in one GPU) - # as FSDP module.to(device) would first summon all parameters - # (TODO: need to figure out solution) - self.model_to_device() - - def model_to_device(self) -> None: - log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") - # ensure we update the device type in the lightning module - assert self.lightning_module - self.lightning_module.to(self.root_device) - - def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - # we don't need precision context since casting is done by FSDP - # read `mixed_precision` docstring here: https://pytorch.org/docs/stable/fsdp.html - assert self.model is not None - return self.model(*args, **kwargs) - - def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - assert self.model is not None - return self.model(*args, **kwargs) - - def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - assert self.model is not None - return self.model(*args, **kwargs) - - def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - assert self.model is not None - return self.model(*args, **kwargs) - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "fsdp", cls, description="Fully sharded training with checkpointing the full state dict." - ) - - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) - - -def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: - from fairscale.nn.misc.flatten_params_wrapper import FlatParameter - - return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py deleted file mode 100644 index bc7b56b7c142d..0000000000000 --- a/src/pytorch_lightning/strategies/sharded.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple - -from torch import Tensor -from torch.nn import Module -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _reinit_optimizers_with_oss -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS -else: - OSS = ShardedDataParallel = object - - -class DDPShardedStrategy(DDPStrategy): - """Optimizer and gradient sharded training provided by FairScale.""" - - strategy_name = "ddp_sharded" - _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def connect(self, model: "pl.LightningModule") -> None: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return super().connect(model) - - def setup(self, trainer: "pl.Trainer") -> None: - assert self.accelerator is not None - self.accelerator.setup(trainer) - - # move the model to the correct device - self.model_to_device() - - # skip wrapping the model if we are not fitting as no gradients need to be exchanged - trainer_fn = trainer.state.fn - if trainer_fn == TrainerFn.FITTING: - if self._layer_sync: - assert self.model is not None - self.model = self._layer_sync.apply(self.model) - - self.setup_precision_plugin() - - if trainer_fn == TrainerFn.FITTING: - self.configure_ddp() - - def configure_ddp(self) -> None: - self._set_ddp_kwargs() - assert self.lightning_module is not None - self.setup_optimizers(self.lightning_module.trainer) - assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - self.model, self.optimizers = self._setup_model_and_optimizers( - model=_LightningModuleWrapperBase(self.model), - optimizers=self.optimizers, - ) - _optimizers_to_device(self.optimizers, self.root_device) - - def _set_ddp_kwargs(self) -> None: - if "reduce_buffer_size" not in self._ddp_kwargs: - # For multi-node training, enabling bucketing will improve performance. - self._ddp_kwargs["reduce_buffer_size"] = self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0 - - def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: - """Wraps the model and optimizers with fairscale components. 
- - Return: - The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module - and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. - """ - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) - return model, optimizers - - def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - assert self.lightning_module is not None - if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: - return optimizers - optimizers = [o._optimizer if isinstance(o, LightningOptimizer) else o for o in optimizers] - return _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) - - def pre_backward(self, closure_loss: Tensor) -> None: - pass - - @contextmanager - def block_backward_sync(self) -> Generator: - """Blocks syncing gradients behaviour on backwards pass. - - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, ShardedDataParallel): - with self.model.no_sync(): - yield None - else: - yield None - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_sharded_find_unused_parameters_false", - cls, - description="DDP Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py deleted file mode 100644 index 74fb1f4026ec0..0000000000000 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple - -from torch import Tensor -from torch.nn import Module -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _reinit_optimizers_with_oss -from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS - -else: - OSS = ShardedDataParallel = object - - -class DDPSpawnShardedStrategy(DDPSpawnStrategy): - """Optimizer sharded training provided by FairScale.""" - - strategy_name = "ddp_sharded_spawn" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def connect(self, model: "pl.LightningModule") -> None: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPSpawnShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return super().connect(model) - - def configure_ddp(self) -> None: - # set up optimizers after the wrapped module has been moved to the device - assert self.lightning_module is not None - self.setup_optimizers(self.lightning_module.trainer) - assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - self.model, self.optimizers = self._setup_model_and_optimizers( - model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers - ) - _optimizers_to_device(self.optimizers, self.root_device) - - def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: - """Wraps the model and optimizers with fairscale components. - - Return: - The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module - and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. 
- """ - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) - return model, optimizers - - def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - assert self.lightning_module - if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: - return optimizers - optimizers = [o._optimizer if isinstance(o, LightningOptimizer) else o for o in optimizers] - return _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) - - @contextmanager - def block_backward_sync(self) -> Generator: - """Blocks syncing gradients behaviour on backwards pass. - - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, ShardedDataParallel): - with self.model.no_sync(): - yield None - else: - yield None - - def pre_backward(self, closure_loss: Tensor) -> None: - pass - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_sharded_spawn_find_unused_parameters_false", - cls, - description="DDP Spawn Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6e23d22d2f315..53e7dd227480e 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -42,13 +42,11 @@ ColossalAIPrecisionPlugin, DeepSpeedPrecisionPlugin, DoublePrecisionPlugin, - FullyShardedNativeMixedPrecisionPlugin, HPUPrecisionPlugin, IPUPrecisionPlugin, MixedPrecisionPlugin, PLUGIN_INPUT, PrecisionPlugin, - ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, TPUPrecisionPlugin, ) @@ -58,9 +56,6 @@ from pytorch_lightning.strategies import ( ColossalAIStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -580,12 +575,8 @@ def _check_and_init_precision(self) -> PrecisionPlugin: ) device = "cpu" if self._accelerator_flag == "cpu" else "cuda" - if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): - return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) if isinstance(self.strategy, DDPFullyShardedNativeStrategy): return FullyShardedNativeNativeMixedPrecisionPlugin(self._precision_flag, device) - if isinstance(self.strategy, DDPFullyShardedStrategy): - return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return MixedPrecisionPlugin(self._precision_flag, device) raise RuntimeError("No precision set") @@ -670,10 +661,7 @@ def is_distributed(self) -> bool: return self.strategy.is_distributed distributed_strategy = ( DDPStrategy, - DDPSpawnShardedStrategy, - DDPShardedStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, diff --git a/tests/tests_app/components/multi_node/test_trainer.py b/tests/tests_app/components/multi_node/test_trainer.py index 7cd28a0e09992..be6a48430d099 100644 --- a/tests/tests_app/components/multi_node/test_trainer.py +++ 
b/tests/tests_app/components/multi_node/test_trainer.py @@ -94,6 +94,3 @@ def test_trainer_run_executor_arguments_choices( def test_trainer_run_executor_invalid_strategy_instances(): with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."): _, _ = _get_args_after_tracer_injection(strategy=pl.strategies.DDPSpawnStrategy()) - - with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."): - _, _ = _get_args_after_tracer_injection(strategy=pl.strategies.DDPSpawnShardedStrategy()) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index 87f5b91f48f1f..9368ea20ad21e 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -24,7 +24,6 @@ from pytorch_lightning.accelerators.mps import MPSAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.colossalai import _COLOSSALAI_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE @@ -62,7 +61,6 @@ def __new__( mps: Optional[bool] = None, skip_windows: bool = False, standalone: bool = False, - fairscale: bool = False, deepspeed: bool = False, rich: bool = False, omegaconf: bool = False, @@ -90,7 +88,6 @@ def __new__( skip_windows: Skip for Windows platform. standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. - fairscale: Require that facebookresearch/fairscale is installed. deepspeed: Require that microsoft/DeepSpeed is installed. rich: Require that willmcgugan/rich is installed. omegaconf: Require that omry/omegaconf is installed. @@ -179,14 +176,6 @@ def __new__( # used in conftest.py::pytest_collection_modifyitems kwargs["standalone"] = True - if fairscale: - if skip_windows: - raise ValueError( - "`skip_windows` is not necessary when `fairscale` is set as it does not support Windows." - ) - conditions.append(not _FAIRSCALE_AVAILABLE) - reasons.append("Fairscale") - if deepspeed: conditions.append(not _DEEPSPEED_AVAILABLE) reasons.append("Deepspeed") diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py deleted file mode 100644 index e040523c1e9c9..0000000000000 --- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch - -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from tests_pytorch.helpers.runif import RunIf - -ShardedGradScaler = None -if _FAIRSCALE_AVAILABLE: - from fairscale.optim.grad_scaler import ShardedGradScaler - - -@RunIf(fairscale=True) -@pytest.mark.parametrize( - "precision,scaler,expected", - [ - (16, torch.cuda.amp.GradScaler(), torch.cuda.amp.GradScaler), - (16, None, ShardedGradScaler), - ("bf16", None, None), - (32, None, None), - ], -) -def test_sharded_precision_scaler(precision, scaler, expected): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda") - if expected: - assert isinstance(plugin.scaler, expected) - else: - assert not plugin.scaler diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index 8a96bd8fdd90c..368aec30e34a8 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -19,7 +19,7 @@ from lightning_fabric.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning import Trainer -from pytorch_lightning.strategies import DDPShardedStrategy, DDPStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DDPStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf @@ -59,23 +59,17 @@ def environment_combinations(): @RunIf(mps=False) @pytest.mark.parametrize( "strategy_cls", - [DDPStrategy, DDPShardedStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], + [DDPStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], ) @mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=True) def test_ranks_available_manual_strategy_selection(_, strategy_cls): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 - for i, (cluster, variables, expected) in enumerate(environment_combinations()): + for cluster, variables, expected in environment_combinations(): with mock.patch.dict(os.environ, variables): - if strategy_cls is DDPShardedStrategy and i == 0: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = strategy_cls( - parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster - ) - else: - strategy = strategy_cls( - parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster - ) + strategy = strategy_cls( + parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster + ) trainer = Trainer(strategy=strategy, num_nodes=num_nodes) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] @@ -89,7 +83,6 @@ def test_ranks_available_manual_strategy_selection(_, strategy_cls): "trainer_kwargs", [ dict(strategy="ddp", accelerator="gpu", devices=[1, 2]), - dict(strategy="ddp_sharded", accelerator="gpu", devices=[1, 2]), dict(strategy="ddp_spawn", accelerator="cpu", devices=2), dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], @@ -99,7 +92,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, 
trainer_kwar num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) - for i, (cluster, variables, expected) in enumerate(environment_combinations()): + for cluster, variables, expected in environment_combinations(): if trainer_kwargs["strategy"] == "ddp_spawn": if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)): # slurm and torchelastic do not work with spawn strategies @@ -108,11 +101,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar expected.update(global_rank=(expected["node_rank"] * 2), local_rank=0) with mock.patch.dict(os.environ, variables): - if "sharded" in trainer_kwargs["strategy"] and i == 0: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(**trainer_kwargs) - else: - trainer = Trainer(**trainer_kwargs) + trainer = Trainer(**trainer_kwargs) assert type(trainer.strategy.cluster_environment) is type(cluster) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py deleted file mode 100644 index a60c4d8cb8ecf..0000000000000 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -from typing import Any, Dict, Optional -from unittest import mock - -import pytest -import torch - -from pytorch_lightning import Trainer -from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin -from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.runif import RunIf - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn import FullyShardedDataParallel, wrap - - -class TestFSDPModelManualWrapped(BoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.layer: Optional[torch.nn.Module] = None - - def _init_model(self) -> None: - self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) - - def setup(self, stage: str) -> None: - if self.layer is None: - self._init_model() - - def configure_sharded_model(self) -> None: - # the model is already wrapped with FSDP: no need to wrap again! 
- if isinstance(self.layer, FullyShardedDataParallel): - return - for i, layer in enumerate(self.layer): - if i % 2 == 0: - self.layer[i] = wrap(layer) - self.layer = wrap(self.layer) - - def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: - # when loading full state dict, we first need to create a new unwrapped model - self._init_model() - - def configure_optimizers(self): - return torch.optim.SGD(self.layer.parameters(), lr=0.1) - - def on_train_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_test_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_validation_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_prediction_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def _assert_layer_fsdp_instance(self) -> None: - assert isinstance(self.layer, FullyShardedDataParallel) - assert isinstance(self.layer.module[0], FullyShardedDataParallel) - assert isinstance(self.layer.module[2], FullyShardedDataParallel) - - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[0].reshard_after_forward - assert self.layer.module[2].reshard_after_forward - - if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): - assert self.layer.mixed_precision - assert self.layer.module[0].mixed_precision - assert self.layer.module[2].mixed_precision - - -class TestFSDPModelAutoWrapped(BoringModel): - def __init__(self): - super().__init__() - self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) - - def configure_optimizers(self): - return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) - - def on_train_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_test_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_validation_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_prediction_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def _assert_layer_fsdp_instance(self) -> None: - assert isinstance(self.trainer.model, FullyShardedDataParallel) - # `disable_reshard_on_root=True` (default) in FSDP which turns-off resharding - assert not self.trainer.model.reshard_after_forward - - if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): - assert self.trainer.model.mixed_precision - - -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() - - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) - - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) - - -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - _assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model) # model is wrapped, will not call configure_shared_model - - # provide model 
path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model, ckpt_path=model_path) - - # Predict entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model, ckpt_path=model_path) - - -def test_invalid_on_cpu(tmpdir): - """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - with pytest.raises( - MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." - ): - trainer.strategy.setup_environment() - - -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@RunIf(fairscale=True) -def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 - ) - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fully_sharded_strategy_checkpoint(tmpdir): - """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" - - model = TestFSDPModelManualWrapped() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - strategy="fsdp", - precision=16, - max_epochs=1, - enable_progress_bar=False, - enable_model_summary=False, - ) - _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fsdp_gradient_clipping_raises(tmpdir): - """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" - model = TestFSDPModelManualWrapped() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - strategy="fsdp", - fast_dev_run=True, - accelerator="gpu", - devices=1, - precision=16, - gradient_clip_val=1, - gradient_clip_algorithm="norm", - enable_progress_bar=False, - enable_model_summary=False, - ) - with pytest.raises( - MisconfigurationException, match="gradient_clip_algorithm='norm'` is currently not supported for `FullySharded" - ): - trainer.fit(model) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fsdp_rewrap_limitation(tmpdir): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - max_steps=1, - limit_val_batches=0, - 
limit_test_batches=1, - strategy="fsdp", - ) - model = TestFSDPModelAutoWrapped() - trainer.fit(model) - - with pytest.raises(MisconfigurationException, match="Using the same instance of model .* not supported"): - trainer.test(model) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_invalid_parameters_in_optimizer(): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) - - class EmptyParametersModel(BoringModel): - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=1e-2) - - model = EmptyParametersModel() - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): - trainer.fit(model) - - class NoFlatParametersModel(BoringModel): - def configure_optimizers(self): - layer = torch.nn.Linear(4, 5) - return torch.optim.Adam(layer.parameters(), lr=1e-2) - - model = NoFlatParametersModel() - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): - trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py index 39e10a05fc328..9e249e4f14a43 100644 --- a/tests/tests_pytorch/strategies/test_registry.py +++ b/tests/tests_pytorch/strategies/test_registry.py @@ -16,9 +16,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import CheckpointIO from pytorch_lightning.strategies import ( - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, + DDPFullyShardedNativeStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -67,17 +65,14 @@ def test_tpu_spawn_debug_strategy_registry(xla_available): assert isinstance(trainer.strategy, TPUSpawnStrategy) -def test_fsdp_strategy_registry(tmpdir): - - strategy = "fsdp" - +@RunIf(min_torch="1.12") +def test_fsdp_strategy_registry(cuda_count_1): + strategy = "fsdp_native" assert strategy in StrategyRegistry - assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedStrategy - - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy) + assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedNativeStrategy - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + trainer = Trainer(accelerator="cuda", strategy=strategy) + assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) @pytest.mark.parametrize( @@ -105,24 +100,10 @@ def test_fsdp_strategy_registry(tmpdir): {"find_unused_parameters": False, "start_method": "fork"}, marks=RunIf(skip_windows=True), ), - ( - "ddp_sharded_spawn_find_unused_parameters_false", - DDPSpawnShardedStrategy, - {"find_unused_parameters": False}, - ), - ( - "ddp_sharded_find_unused_parameters_false", - DDPShardedStrategy, - {"find_unused_parameters": False}, - ), ], ) def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params): - if "sharded" in strategy_name: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) - else: - trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) + trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) assert isinstance(trainer.strategy, strategy) assert strategy_name in StrategyRegistry assert StrategyRegistry[strategy_name]["init_params"] == expected_init_params diff --git 
a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py deleted file mode 100644 index 29fd4607c521b..0000000000000 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ /dev/null @@ -1,356 +0,0 @@ -import os -from copy import deepcopy -from typing import Mapping -from unittest import mock -from unittest.mock import Mock - -import pytest -import torch -from torch import Tensor - -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import MixedPrecisionPlugin -from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy -from pytorch_lightning.trainer.states import TrainerFn -from tests_pytorch.helpers.runif import RunIf - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS - - -class ModelWithAdamOptimizer(BoringModel): - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.layer.parameters(), lr=0.1) - return optimizer - - -class CheckModelRestore(ModelWithAdamOptimizer): - def __init__(self, old_model_state_dict, old_optimizer_states): - super().__init__() - self.old_model_state_dict = old_model_state_dict - self.old_optimizer_states = old_optimizer_states - - def on_train_start(self): - assert all( - self._is_equal(actual, expected) for actual, expected in zip(self.state_dict(), self.old_model_state_dict) - ) - - for optimizer, state in zip(self.trainer.optimizers, self.old_optimizer_states): - optimizer_state = self.trainer.strategy.optimizer_state(optimizer) - self._is_equal(optimizer_state, state) - - def _is_equal(self, a, b): - if isinstance(a, Tensor): - return torch.allclose(a, b) - - if isinstance(a, Mapping): - return all(self._is_equal(a.get(k, None), b.get(k, None)) for k in b.keys()) - - return a == b - - -@pytest.mark.parametrize("clip_val", [0, 10]) -@RunIf(min_cuda_gpus=1, fairscale=True) -@mock.patch("fairscale.optim.oss.OSS.clip_grad_norm") -def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_val, tmpdir): - """Ensure that clip gradients is only called if the value is greater than 0.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - strategy="ddp_sharded", - accelerator="gpu", - devices=1, - precision=16, - fast_dev_run=True, - gradient_clip_val=clip_val, - ) - trainer.fit(model) - if clip_val > 0: - mock_oss_clip_grad_norm.assert_called() - else: - mock_oss_clip_grad_norm.assert_not_called() - - -@RunIf(fairscale=True) -@pytest.mark.parametrize( - "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] -) -def test_sharded_ddp_choice(strategy, expected): - """Test to ensure that strategy is correctly chosen.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(fast_dev_run=True, strategy=strategy) - assert isinstance(trainer.strategy, expected) - - -@RunIf(min_cuda_gpus=1, fairscale=True) -@pytest.mark.parametrize( - "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] -) -def test_ddp_choice_sharded_amp(strategy, expected): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - with 
pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy) - assert isinstance(trainer.strategy, expected) - assert isinstance(trainer.precision_plugin, MixedPrecisionPlugin) - - -@RunIf(fairscale=True) -def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): - """Test to ensure that checkpoint is saved correctly.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): - """Test to ensure that checkpoint is saved correctly when using multiple GPUs.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_finetune(tmpdir): - """Test to ensure that we can save and restart training (simulate fine-tuning)""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - trainer = Trainer(fast_dev_run=True) - trainer.fit(saved_model) - - -@RunIf(fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): - """Test to ensure that resuming from checkpoint works.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - - model = BoringModel() - - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model, ckpt_path=checkpoint_path) - - -@RunIf(min_cuda_gpus=1, fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): - """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = 
os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - - model = BoringModel() - - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model, ckpt_path=checkpoint_path) - - -@RunIf(standalone=True, fairscale=True) -@pytest.mark.parametrize( - "trainer_kwargs", - ( - dict(accelerator="cpu", devices=2), - pytest.param(dict(accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)), - ), -) -def test_ddp_sharded_strategy_test_multigpu(trainer_kwargs): - """Test to ensure we can use validate and test without fit.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - fast_dev_run=True, - enable_progress_bar=False, - enable_model_summary=False, - **trainer_kwargs, - ) - - trainer.validate(model) - trainer.test(model) - - -class BoringModelSharded(BoringModel): - def on_train_start(self) -> None: - """Check if trainer module is wrapped as ShardedDataParallel during training stage.""" - assert isinstance(self.trainer.model, ShardedDataParallel) - - def on_test_start(self) -> None: - """Check if trainer module remains as LightningModule during test stage.""" - assert isinstance(self.trainer.model, LightningModule) - - def on_validation_start(self) -> None: - """Check if trainer module remains as LightningModule during test stage.""" - if self.trainer.state.fn == TrainerFn.FITTING: - assert isinstance(self.trainer.model, ShardedDataParallel) - else: - assert isinstance(self.trainer.model, LightningModule) - - def on_predict_start(self) -> None: - """Check if trainer module remains as LightningModule during prediction stage.""" - assert isinstance(self.trainer.model, LightningModule) - - -@RunIf(fairscale=True) -def test_configure_ddp(tmpdir): - """Tests with ddp sharded strategy.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True) - - model = BoringModelSharded() - - trainer.fit(model) - trainer.test(model, dataloaders=model.test_dataloader()) - trainer.validate(model, dataloaders=model.val_dataloader()) - trainer.predict(model, dataloaders=model.predict_dataloader()) - - -@RunIf(fairscale=True) -@mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) -@pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy]) -def test_custom_kwargs_sharded(_, cls): - """Tests to ensure that if custom kwargs are passed, they are set correctly.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = cls(reduce_fp16=True) - strategy._lightning_module = Mock(spec=LightningModule) - strategy._lightning_module.trainer = Mock() - strategy.parallel_devices = [Mock()] - class_name = "sharded" if isinstance(strategy, DDPShardedStrategy) else "sharded_spawn" - - with mock.patch(f"pytorch_lightning.strategies.{class_name}.ShardedDataParallel", autospec=True) as mock_sharded: - strategy.configure_ddp() - args, kwargs = mock_sharded.call_args - assert "reduce_fp16" in kwargs - assert kwargs["reduce_fp16"] - - -@RunIf(fairscale=True) -@mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) -@pytest.mark.parametrize(["params", "expected_buffer_size"], [(dict(), 0), (dict(reduce_buffer_size=128), 128)]) 
-@pytest.mark.parametrize("num_nodes", [1, 2]) -def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_size, num_nodes): - """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = DDPShardedStrategy(**params) - strategy.num_nodes = num_nodes - strategy._lightning_module = Mock(spec=LightningModule) - strategy._lightning_module.trainer = Mock() - strategy.parallel_devices = [Mock()] - - with mock.patch("pytorch_lightning.strategies.sharded.ShardedDataParallel", autospec=True) as mock_sharded: - strategy.configure_ddp() - args, kwargs = mock_sharded.call_args - assert "reduce_buffer_size" in kwargs - - if num_nodes > 1 and len(params) == 0: - # If user has not specified a buffer size and we're using multiple nodes, check to see if default is set - assert kwargs["reduce_buffer_size"] == DDPShardedStrategy._REDUCE_BUFFER_SIZE_DEFAULT - else: - assert kwargs["reduce_buffer_size"] == expected_buffer_size - - -@RunIf(fairscale=True) -def test_block_backward_sync(): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = DDPShardedStrategy() - model = mock.MagicMock(spec=ShardedDataParallel) - with mock.patch.object(strategy, "_model", model): - with strategy.block_backward_sync(): - pass - model.no_sync.assert_called_once() - - -@pytest.mark.parametrize( - "strategy_name,expected_ddp_kwargs", - [ - ("ddp_sharded", {}), - ("ddp_sharded_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_sharded_spawn", {}), - ("ddp_sharded_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), - ], -) -def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy_name) - assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs - - -class BoringFairScaleOptimizerModel(BoringModel): - def configure_optimizers(self): - base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir): - model = ModelWithAdamOptimizer() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=1, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=2, - ) - trainer.fit(model) - - checkpoint_path = trainer.checkpoint_callback.best_model_path - ckpt = torch.load(checkpoint_path) - old_model_state_dict = deepcopy(ckpt["state_dict"]) - old_optimizer_states = deepcopy(ckpt["optimizer_states"]) - - model = CheckModelRestore(old_model_state_dict, old_optimizer_states) - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=2, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=1, - ) - trainer.fit(model, ckpt_path=checkpoint_path) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 2248c14fee75c..6d0f02250cf25 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ 
b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -39,8 +39,6 @@ from pytorch_lightning.strategies import ( DataParallelStrategy, DDPFullyShardedNativeStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -241,11 +239,6 @@ def test_interactive_incompatible_backend_error(cuda_count_2, monkeypatch): with pytest.raises(MisconfigurationException, match=r"strategy='ddp_spawn'\)`.*is not compatible"): Trainer(strategy="ddp_spawn", accelerator="gpu", devices=2) - with pytest.raises( - MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible" - ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2) - with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): # Edge case: AcceleratorConnector maps dp to ddp if accelerator != gpu Trainer(strategy="dp") @@ -277,20 +270,12 @@ def test_interactive_compatible_strategy_ddp_fork(monkeypatch): [ ("ddp", DDPStrategy), ("ddp_spawn", DDPSpawnStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ], ) @pytest.mark.parametrize("devices", [1, 2]) def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strategy_class, devices): - if "sharded" in strategy: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices - ) - else: - trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) + trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) assert isinstance(trainer.strategy, strategy_class) @@ -380,23 +365,15 @@ def test_exception_invalid_strategy(): ("ddp", DDPStrategy), ("ddp_find_unused_parameters_false", DDPStrategy), ("dp", DataParallelStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ), ) @pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()]) def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class, mps_count_1, cuda_count_0): - if "sharded" in strategy: - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): - Trainer(accelerator=accelerator, strategy=strategy) - else: - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): - Trainer(accelerator=accelerator, strategy=strategy) - - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"), pytest.deprecated_call( - match="FairScale has been deprecated in v1.9.0" - ): + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator=accelerator, strategy=strategy) + + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): Trainer(accelerator="mps", strategy=strategy_class()) @@ -420,7 +397,6 @@ def test_strategy_choice_cpu_instance(strategy_class): assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize( ["strategy", "strategy_class"], [ @@ -429,31 +405,22 @@ def 
test_strategy_choice_cpu_instance(strategy_class): ("ddp", DDPStrategy), ("ddp_find_unused_parameters_false", DDPStrategy), ("dp", DataParallelStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ], ) -def test_strategy_choice_gpu_str(strategy, strategy_class): - if "sharded" in strategy: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) - else: - trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) +def test_strategy_choice_gpu_str(strategy, strategy_class, cuda_count_2): + trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) -def test_strategy_choice_gpu_instance(strategy_class): +def test_strategy_choice_gpu_instance(strategy_class, cuda_count_2, mps_count_0): trainer = Trainer(strategy=strategy_class(), accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) -def test_device_type_when_strategy_instance_gpu_passed(strategy_class): - +def test_device_type_when_strategy_instance_gpu_passed(strategy_class, cuda_count_2, mps_count_0): trainer = Trainer(strategy=strategy_class(), accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) assert isinstance(trainer.accelerator, CUDAAccelerator) diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 81bbe0f9f2e15..941e5eafdf7e6 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -51,15 +51,7 @@ ) from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler -from pytorch_lightning.strategies import ( - DataParallelStrategy, - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, - DDPSpawnStrategy, - DDPStrategy, - SingleDeviceStrategy, -) +from pytorch_lightning.strategies import DataParallelStrategy, DDPSpawnStrategy, DDPStrategy, SingleDeviceStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE @@ -2012,13 +2004,6 @@ def training_step(self, batch, batch_idx): CPUAccelerator, 1, ), - ( - {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 1, - ), ( {"strategy": DDPSpawnStrategy(), "accelerator": "cpu", "devices": 2}, DDPSpawnStrategy, @@ -2042,20 +2027,6 @@ def training_step(self, batch, batch_idx): CUDAAccelerator, 2, ), - ( - {"strategy": DDPFullyShardedStrategy(), "accelerator": "cuda", "devices": 2}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 2, - ), - ( - {"strategy": DDPSpawnShardedStrategy(), "accelerator": "cuda", "devices": 2}, - DDPSpawnShardedStrategy, - "ddp_sharded_spawn", - CUDAAccelerator, - 2, - ), ( {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPSpawnStrategy, @@ -2063,39 +2034,13 @@ def training_step(self, batch, batch_idx): 
CUDAAccelerator, 2, ), - ( - {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1, "num_nodes": 2}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 1, - ), - ( - {"strategy": "ddp_sharded", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, - DDPShardedStrategy, - "ddp_sharded", - CUDAAccelerator, - 2, - ), - ( - {"strategy": "ddp_sharded_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, - DDPSpawnShardedStrategy, - "ddp_sharded_spawn", - CUDAAccelerator, - 2, - ), ], ) def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices): if trainer_kwargs.get("accelerator") == "cuda": mock_cuda_count(monkeypatch, trainer_kwargs["devices"]) - strategy = trainer_kwargs.get("strategy") - if (isinstance(strategy, str) and "sharded" in strategy) or isinstance(strategy, (DDPShardedStrategy)): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(**trainer_kwargs) - else: - trainer = Trainer(**trainer_kwargs) + trainer = Trainer(**trainer_kwargs) assert isinstance(trainer.strategy, strategy_cls) assert strategy_cls.strategy_name == strategy_name @@ -2191,7 +2136,7 @@ def on_fit_start(self): logger.finalize.assert_called_once_with("failed") -# TODO: replace with 1.14 when it is released +# TODO: replace with 2.0 when it is released @RunIf(min_torch="1.14.0.dev20221202") def test_trainer_compiled_model(): model = BoringModel() @@ -2219,11 +2164,9 @@ def test_trainer_compiled_model(): model = torch.compile(model) - trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPShardedStrategy) - + trainer = Trainer(fast_dev_run=True, strategy="fsdp_native") with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): trainer.fit(model) - trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPStrategy) - + trainer = Trainer(fast_dev_run=True, strategy="ddp") trainer.fit(model) diff --git a/tests/tests_pytorch/utilities/test_imports.py b/tests/tests_pytorch/utilities/test_imports.py index cc24904265cf6..ff1b3d5a463e4 100644 --- a/tests/tests_pytorch/utilities/test_imports.py +++ b/tests/tests_pytorch/utilities/test_imports.py @@ -20,7 +20,7 @@ from unittest import mock import pytest -from lightning_utilities.core.imports import compare_version, module_available, RequirementCache +from lightning_utilities.core.imports import compare_version, RequirementCache from torch.distributed import is_available from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE @@ -108,18 +108,13 @@ def clean_import(): _shortcut_patch(RequirementCache.__bool__, ("jsonargparse[signatures]>=4.12.0",), ("requirement",)), "pytorch_lightning.cli", ), - ( - "lightning_utilities.core.imports.module_available", - _shortcut_patch(module_available, ("fairscale.nn",)), - "pytorch_lightning.strategies", - ), ( "lightning_utilities.core.imports.compare_version", _shortcut_patch(compare_version, ("torch", operator.ge, "1.12.0")), "pytorch_lightning.strategies.fully_sharded_native", ), ], - ids=["ProcessGroup", "neptune", "cli", "fairscale", "fully_sharded_native"], + ids=["ProcessGroup", "neptune", "cli", "fully_sharded_native"], ) def test_import_with_unavailable_dependencies(patch_name, new_fn, to_import, clean_import): """This tests simulates unavailability of certain modules by patching the functions that check for their From 668096946d7c462d99c2b976bb60ff8ecf2d8803 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 20 Jan 2023 11:11:23 +0100 Subject: [PATCH 2/2] Update src/pytorch_lightning/CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index a800ee853d123..0ad48ba732ca1 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -44,7 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `Trainer(strategy='horovod')` support ([#16150](https://github.com/Lightning-AI/lightning/pull/16150)) -- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#TODO](https://github.com/PyTorchLightning/pytorch-lightning/pull/TODO)) +- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#16400](https://github.com/PyTorchLightning/pytorch-lightning/pull/16400)) * Removed the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class * Removed the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class * Removed the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class
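
Migration sketch (an illustration, not part of the patch itself): after this removal, user code that previously requested one of the FairScale-backed strategies, for example `Trainer(strategy="ddp_sharded")` or `strategy="fsdp"`, would move to the PyTorch-native FSDP strategy that this diff keeps, registered as `fsdp_native` and backed by `DDPFullyShardedNativeStrategy`. A minimal sketch, assuming PyTorch Lightning at this point in the 1.9/2.0 line, torch >= 1.12, and at least two CUDA devices:

from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel

model = BoringModel()

# previously (removed in this patch):
# trainer = Trainer(strategy="ddp_sharded", accelerator="gpu", devices=2, precision=16)

# PyTorch-native FSDP replacement, as exercised by test_fsdp_strategy_registry above
trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=2, precision=16)
trainer.fit(model)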