From 6ef051c09b05ba22722cfe7b1a0fae9f01618940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 19 Jan 2023 18:44:16 +0100 Subject: [PATCH 1/2] Remove the FairScale integration --- docs/source-pytorch/api_references.rst | 5 - docs/source-pytorch/conf.py | 1 - docs/source-pytorch/extensions/plugins.rst | 2 - docs/source-pytorch/guides/speed.rst | 2 +- .../pytorch/check-avail-strategies.py | 1 - requirements/pytorch/strategies.txt | 1 - .../components/multi_node/trainer.py | 1 - src/pytorch_lightning/CHANGELOG.md | 8 + .../callbacks/stochastic_weight_avg.py | 4 +- src/pytorch_lightning/overrides/fairscale.py | 42 --- src/pytorch_lightning/plugins/__init__.py | 4 - .../plugins/precision/__init__.py | 4 - .../precision/fully_sharded_native_amp.py | 42 --- .../plugins/precision/sharded_native_amp.py | 53 --- .../serve/servable_module_validator.py | 3 +- src/pytorch_lightning/strategies/__init__.py | 3 - src/pytorch_lightning/strategies/ddp.py | 6 - .../strategies/fully_sharded.py | 313 --------------- src/pytorch_lightning/strategies/sharded.py | 146 ------- .../strategies/sharded_spawn.py | 121 ------ .../connectors/accelerator_connector.py | 12 - .../components/multi_node/test_trainer.py | 3 - tests/tests_pytorch/helpers/runif.py | 11 - .../precision/test_sharded_precision.py | 43 --- .../plugins/test_cluster_integration.py | 27 +- ..._ddp_fully_sharded_with_full_state_dict.py | 253 ------------- .../tests_pytorch/strategies/test_registry.py | 35 +- .../strategies/test_sharded_strategy.py | 356 ------------------ .../connectors/test_accelerator_connector.py | 51 +-- tests/tests_pytorch/trainer/test_trainer.py | 67 +--- tests/tests_pytorch/utilities/test_imports.py | 9 +- 31 files changed, 44 insertions(+), 1585 deletions(-) delete mode 100644 src/pytorch_lightning/overrides/fairscale.py delete mode 100644 src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py delete mode 100644 src/pytorch_lightning/plugins/precision/sharded_native_amp.py delete mode 100644 src/pytorch_lightning/strategies/fully_sharded.py delete mode 100644 src/pytorch_lightning/strategies/sharded.py delete mode 100644 src/pytorch_lightning/strategies/sharded_spawn.py delete mode 100644 tests/tests_pytorch/plugins/precision/test_sharded_precision.py delete mode 100644 tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py delete mode 100644 tests/tests_pytorch/strategies/test_sharded_strategy.py diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index c722af960212b..f7e9efb18cd45 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -176,13 +176,11 @@ precision ColossalAIPrecisionPlugin DeepSpeedPrecisionPlugin DoublePrecisionPlugin - FullyShardedNativeMixedPrecisionPlugin FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin PrecisionPlugin - ShardedNativeMixedPrecisionPlugin TPUBf16PrecisionPlugin TPUPrecisionPlugin @@ -276,9 +274,6 @@ strategies BaguaStrategy ColossalAIStrategy DDPFullyShardedNativeStrategy - DDPFullyShardedStrategy - DDPShardedStrategy - DDPSpawnShardedStrategy DDPSpawnStrategy DDPStrategy DataParallelStrategy diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 80a055f3c9bef..bfb9e80e0c44f 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -294,7 +294,6 @@ def _transform_changelog(path_in: str, path_out: str) -> None: "numpy": 
("https://numpy.org/doc/stable/", None), "PIL": ("https://pillow.readthedocs.io/en/stable/", None), "torchmetrics": ("https://torchmetrics.readthedocs.io/en/stable/", None), - "fairscale": ("https://fairscale.readthedocs.io/en/latest/", None), "graphcore": ("https://docs.graphcore.ai/en/latest/", None), } diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index 560c26a3e1cda..b9f21a8ad1610 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -55,13 +55,11 @@ The full list of built-in precision plugins is listed below. ColossalAIPrecisionPlugin DeepSpeedPrecisionPlugin DoublePrecisionPlugin - FullyShardedNativeMixedPrecisionPlugin FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin PrecisionPlugin - ShardedNativeMixedPrecisionPlugin TPUBf16PrecisionPlugin TPUPrecisionPlugin diff --git a/docs/source-pytorch/guides/speed.rst b/docs/source-pytorch/guides/speed.rst index 10b957729904c..b95309c9a8a44 100644 --- a/docs/source-pytorch/guides/speed.rst +++ b/docs/source-pytorch/guides/speed.rst @@ -28,7 +28,7 @@ GPU Training Lightning supports a variety of plugins to speed up distributed GPU training. Most notably: * :class:`~pytorch_lightning.strategies.DDPStrategy` -* :class:`~pytorch_lightning.strategies.DDPShardedStrategy` +* :class:`~pytorch_lightning.strategies.DDPFullyShardedNativeStrategy` * :class:`~pytorch_lightning.strategies.DeepSpeedStrategy` .. code-block:: python diff --git a/requirements/pytorch/check-avail-strategies.py b/requirements/pytorch/check-avail-strategies.py index eb3b66b989401..94bb9b924b769 100644 --- a/requirements/pytorch/check-avail-strategies.py +++ b/requirements/pytorch/check-avail-strategies.py @@ -1,4 +1,3 @@ if __name__ == "__main__": import bagua # noqa: F401 import deepspeed # noqa: F401 - import fairscale # noqa: F401 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4de4dc15f51b0..4010828b7df13 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -2,5 +2,4 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment # colossalai>=0.1.10 # TODO: uncomment when there's a stable version released -fairscale>=0.4.5, <0.4.13 deepspeed>=0.6.0, <=0.7.0 diff --git a/src/lightning_app/components/multi_node/trainer.py b/src/lightning_app/components/multi_node/trainer.py index e3f738abad329..e3da755bfcb40 100644 --- a/src/lightning_app/components/multi_node/trainer.py +++ b/src/lightning_app/components/multi_node/trainer.py @@ -40,7 +40,6 @@ def run( try: pkg = importlib.import_module(pkg_name) trainers.append(pkg.Trainer) - strategies.append(pkg.strategies.DDPSpawnShardedStrategy) strategies.append(pkg.strategies.DDPSpawnStrategy) mps_accelerators.append(pkg.accelerators.MPSAccelerator) except (ImportError, ModuleNotFoundError): diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5455ce9099869..2a2f15ec8ac2e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -44,6 +44,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed `Trainer(strategy='horovod')` support ([#16150](https://github.com/Lightning-AI/lightning/pull/16150)) +- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#TODO](https://github.com/PyTorchLightning/pytorch-lightning/pull/TODO)) + * Removed the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class + * Removed the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class + * Removed the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class + * Removed the `pytorch_lightning.strategies.fully_sharded.DDPFullyShardedStrategy` (fsdp) class + * Removed the `pytorch_lightning.strategies.sharded.DDPShardedStrategy` (ddp_sharded) class + * Removed the `pytorch_lightning.strategies.sharded_spawn.DDPSpawnShardedStrategy` (ddp_sharded_spawn) class + - Removed legacy device arguments in Trainer ([#16171](https://github.com/Lightning-AI/lightning/pull/16171)) * Removed the `Trainer(gpus=...)` argument * Removed the `Trainer(tpu_cores=...)` argument diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index c7705775bc267..1a347cd202abc 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -25,7 +25,7 @@ import pytorch_lightning as pl from lightning_fabric.utilities.types import LRScheduler from pytorch_lightning.callbacks.callback import Callback -from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn @@ -146,7 +146,7 @@ def pl_module_contains_batch_norm(pl_module: "pl.LightningModule") -> bool: return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules()) def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: - if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy, DeepSpeedStrategy)): + if isinstance(trainer.strategy, (DDPFullyShardedNativeStrategy, DeepSpeedStrategy)): raise MisconfigurationException("SWA does not currently support sharded models.") # copy the model before moving it to accelerator device. diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py deleted file mode 100644 index 93b100f9e3135..0000000000000 --- a/src/pytorch_lightning/overrides/fairscale.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import List - -from lightning_utilities.core.imports import package_available -from torch.optim import Optimizer - -from lightning_fabric.plugins import Precision -from lightning_fabric.utilities.imports import _IS_WINDOWS - -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and package_available("fairscale") - -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS -else: - OSS = object - - -def _reinit_optimizers_with_oss(optimizers: List[Optimizer], precision: Precision, num_nodes: int) -> List["OSS"]: - for x, optimizer in enumerate(optimizers): - if not isinstance(optimizer, OSS): - optim_class = type(optimizer) - zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) - is_fp16 = precision.precision == "16" - # For multi-node training, compressing the model shards in fp16 before broadcasting - # improves performance. When using PyTorch AMP, it will not degrade - # the model performance. - zero_optimizer.broadcast_fp16 = is_fp16 and num_nodes > 1 - optimizers[x] = zero_optimizer - del optimizer - return optimizers diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index a9c393a0dd07e..769b0d9199214 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -8,12 +8,10 @@ from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin @@ -33,8 +31,6 @@ "HPUPrecisionPlugin", "MixedPrecisionPlugin", "PrecisionPlugin", - "ShardedNativeMixedPrecisionPlugin", - "FullyShardedNativeMixedPrecisionPlugin", "FullyShardedNativeNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", diff --git a/src/pytorch_lightning/plugins/precision/__init__.py b/src/pytorch_lightning/plugins/precision/__init__.py index d200d1c2f3fb8..85e8c7586c89f 100644 --- a/src/pytorch_lightning/plugins/precision/__init__.py +++ b/src/pytorch_lightning/plugins/precision/__init__.py @@ -15,12 +15,10 @@ from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from 
pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin @@ -29,12 +27,10 @@ "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", "FullyShardedNativeNativeMixedPrecisionPlugin", - "FullyShardedNativeMixedPrecisionPlugin", "HPUPrecisionPlugin", "IPUPrecisionPlugin", "MixedPrecisionPlugin", "PrecisionPlugin", - "ShardedNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", ] diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py deleted file mode 100644 index 904d61f4dffc3..0000000000000 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Any - -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): - """Native AMP for Fully Sharded Training.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: - # see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html - # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect - # for FSDP module. To overcome this, needs to call sharded_module.clip_grad_norm(clip_val) - # however we rely on LightningModule's configure_sharded_model to wrap FSDP, it would be hard to - # trace back the root FSDP. Now we only support clip by value. - raise MisconfigurationException( - f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" - ) diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py deleted file mode 100644 index f4f646b4239a2..0000000000000 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional, Union - -from typing_extensions import Literal - -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.precision.native_amp import MixedPrecisionPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS - from fairscale.optim.grad_scaler import ShardedGradScaler -else: - OSS = ShardedGradScaler = object - - -class ShardedNativeMixedPrecisionPlugin(MixedPrecisionPlugin): - """Native AMP for Sharded Training.""" - - def __init__( - self, precision: Literal["16", 16, "bf16"], device: str, scaler: Optional[ShardedGradScaler] = None - ) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - if not _FAIRSCALE_AVAILABLE: - raise MisconfigurationException( - "You have asked for sharded AMP but you have not installed it." 
- " Install `fairscale` using this guide: https://https://github.com/facebookresearch/fairscale" - ) - super().__init__( - precision, device, scaler=(ShardedGradScaler() if scaler is None and str(precision) == "16" else None) - ) - - def clip_grad_by_norm(self, optimizer: "OSS", clip_val: Union[int, float]) -> None: - optimizer.clip_grad_norm(clip_val) diff --git a/src/pytorch_lightning/serve/servable_module_validator.py b/src/pytorch_lightning/serve/servable_module_validator.py index c3aed93daa570..f654c8a8ab32a 100644 --- a/src/pytorch_lightning/serve/servable_module_validator.py +++ b/src/pytorch_lightning/serve/servable_module_validator.py @@ -11,7 +11,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks import Callback from pytorch_lightning.serve.servable_module import ServableModule -from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy, DDPFullyShardedStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_only @@ -19,7 +19,6 @@ _NOT_SUPPORTED_STRATEGIES = ( DeepSpeedStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, ) _logger = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index dcfb11eecb3a2..2807fafbeff6d 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -18,13 +18,10 @@ from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 -from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 -from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 -from pytorch_lightning.strategies.sharded_spawn import DDPSpawnShardedStrategy # noqa: F401 from pytorch_lightning.strategies.single_device import SingleDeviceStrategy # noqa: F401 from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.single_tpu import SingleTPUStrategy # noqa: F401 diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 5c5db3bcfbd5b..52209ee72d97f 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -39,7 +39,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import 
ParallelStrategy @@ -49,10 +48,6 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep -if _FAIRSCALE_AVAILABLE: - from fairscale.optim import OSS -else: - OSS = object if torch.distributed.is_available(): from torch.distributed.algorithms.model_averaging.averagers import ModelAverager @@ -230,7 +225,6 @@ def _enable_model_averaging(self) -> None: if ( is_distributed_optimizer or isinstance(optimizer, ZeroRedundancyOptimizer) - or (_FAIRSCALE_AVAILABLE and isinstance(optimizer, OSS)) or isinstance(optimizer, PostLocalSGDOptimizer) ): raise ValueError( diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py deleted file mode 100644 index 534fdf8dbbe32..0000000000000 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import logging -from typing import Any, Dict, Generator, List, Optional - -import torch -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation -from pytorch_lightning.utilities.types import STEP_OUTPUT - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn import default_auto_wrap_policy, enable_wrap - from fairscale.nn.data_parallel import FullyShardedDataParallel -else: - FullyShardedDataParallel = None - -log = logging.getLogger(__name__) - - -class _DDPFullyShardedStrategyModuleWrapper(_LightningModuleWrapperBase): - def state_dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # type: ignore[override] - # this is required because with FSDP lightning_module is empty because weights are sharded. - # So we need to call self.trainer.model.state_dict (wrapped version) and use this wraper to - # avoid extra keys `_forward_module.layer.weight.` since we want `layer.weight.` in state_dict. 
- return self._forward_module.state_dict(*args, **kwargs) - - -class DDPFullyShardedStrategy(DDPStrategy): - - strategy_name = "ddp_fully_sharded" - - def __init__( - self, - accelerator: Optional["pl.accelerators.Accelerator"] = None, - cpu_offload: bool = False, - flatten_parameters: bool = True, - reshard_after_forward: bool = True, - move_grads_to_cpu: Optional[bool] = None, - fp32_reduce_scatter: Optional[bool] = None, - compute_dtype: Optional[torch.dtype] = None, - bucket_cap_mb: int = 25, - min_num_params: int = 100_000_000, - state_dict_to_cpu: bool = True, - parallel_devices: Optional[List[torch.device]] = None, - cluster_environment: Optional[ClusterEnvironment] = None, - checkpoint_io: Optional[CheckpointIO] = None, - precision_plugin: Optional[PrecisionPlugin] = None, - process_group_backend: Optional[str] = None, - ): - """Plugin for Fully Sharded Data Parallel provided by FairScale. - - .. warning:: ``DDPFullyShardedStrategy`` is in beta and subject to change. - - Full Sharded Training shards the entire model across all available GPUs, allowing you to scale model - size, whilst using efficient communication to reduce overhead. In practice, this means we can remain - at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar - to ZeRO-Stage 3 but has been built for upstreaming to PyTorch. - - For more information - `check out FairScale's docs `__. - - Defaults have been set and options have been exposed, but may require configuration - based on your level of memory/speed efficiency. We suggest having a look at - `this PR for more information `__. - - Many of the helpful doc strings below came from the original - `FairScale documentation `__. - - Arguments: - cpu_offload: Offload FP32 params to CPU. Only usable in precision=16 mode. - (Default: False). - move_grads_to_cpu: Moves gradient shards to CPU after reduction. - Only disable if using CPU based optimizers - (Default to ``cpu_offload``). - flatten_parameters: Flattens parameter into single contiguous tensor for speed efficiency - (Default: True). - reshard_after_forward: Reshard parameters after the forward pass, which saves memory but slows - down training. This is only relevant when resharding individual layers. - (Default: True). - fp32_reduce_scatter: Reduce-Scatter gradients in FP32. Only relevant in mixed precision - (Default: None). - compute_dtype: dtype for full parameters for computation. Default to torch.float32, - unless using mixed precision, in which case defaults to torch.float16. - (Default: None). - bucket_cap_mb: bucket parameters so that gradient reduction - can potentially overlap with backward computation. - bucket_cap_mb controls the bucket size in MegaBytes (MB). - Buckets are sub-divided based on world_size, - so the max shard size is roughly bucket_cap_mb / world_size. - Values <= 0 disable bucketing. - (Default: 25). - min_num_params: Number of parameters to wrap when using FairScale ``auto_wrap``. - (Default: 1e8) - state_dict_to_cpu: Whether to return parameters (returned by :func:`state_dict`) on CPU device. - If ``False``, this will default to ``compute_device``. - (Default: True). - """ - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." 
- " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__( - accelerator=accelerator, - parallel_devices=parallel_devices, - cluster_environment=cluster_environment, - checkpoint_io=checkpoint_io, - precision_plugin=precision_plugin, - process_group_backend=process_group_backend, - ) - self.cpu_offload = cpu_offload - self.move_grads_to_cpu = move_grads_to_cpu - self.flatten_parameters = flatten_parameters - self.reshard_after_forward = reshard_after_forward - self.fp32_reduce_scatter = fp32_reduce_scatter - self.compute_dtype = compute_dtype - self.bucket_cap_mb = bucket_cap_mb - self.min_num_params = min_num_params - self.state_dict_device = torch.device("cpu") if state_dict_to_cpu else None - self._process_group = None - - @property - def process_group(self) -> Any: - if self._process_group is None: - self._process_group = torch.distributed.new_group() - return self._process_group - - def lightning_module_state_dict(self) -> Dict[str, Any]: - """Returns model state.""" - assert self.model is not None - return self.model.state_dict() - - def connect(self, model: "pl.LightningModule") -> None: - """Called by the accelerator to connect the accelerator and the model with this plugin.""" - # TODO: Wait for this issue to resolve and remove this blocker - # https://github.com/facebookresearch/fairscale/issues/648 - # Also make sure to update the tests - if not is_overridden("configure_sharded_model", self.lightning_module) and len(list(model.parameters())) == 0: - assert self.lightning_module is not None - raise MisconfigurationException( - f"Using the same instance of model with `trainer.{self.lightning_module.trainer.state.fn}()` is not" - " supported with Fairscale FSDP auto-wrap. Please reinitialize your `LightningModule` and pass that." - ) - - super().connect(model) - - def setup_distributed(self) -> None: - if not self.root_device.type == "cuda": - raise MisconfigurationException( - "You selected strategy to be `ddp_fully_sharded`, but GPU is not available." - ) - super().setup_distributed() - - def setup(self, trainer: "pl.Trainer") -> None: - assert self.accelerator - self.accelerator.setup(trainer) - - if trainer.state.fn == TrainerFn.FITTING: - if self._layer_sync: - assert self.model - self.model = self._layer_sync.apply(self.model) - - self.configure_ddp() - assert isinstance(self.model, pl.LightningModule) - self.model = _DDPFullyShardedStrategyModuleWrapper(self.model) - assert self.lightning_module is not None - if not is_overridden("configure_sharded_model", self.lightning_module): - self.model = self._setup_model(self.model) - self.setup_optimizers(self.lightning_module.trainer) - _optimizers_to_device(self.optimizers, self.root_device) - self.barrier() - - self.setup_precision_plugin() - - def setup_optimizers(self, trainer: "pl.Trainer") -> None: - invalid_params_error = False - try: - super().setup_optimizers(trainer) - except ValueError as e: - if "optimizer got an empty parameter list" not in str(e): - raise - invalid_params_error = True - - if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers): - raise ValueError( - "The optimizer does not seem to reference any FSDP parameters. 
HINT: Make sure to create the" - " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the" - " `configure_optimizers()` hook." - ) - - def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: - """Wraps the model into a - :class:`~fairscale.nn.data_parallel.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" - log.detail(f"setting up `Fairscale FSDP` model with device id: {self.root_device.index}.") - - return FullyShardedDataParallel( - module=model, - process_group=self.process_group, - cpu_offload=self.cpu_offload, - move_grads_to_cpu=self.move_grads_to_cpu, - flatten_parameters=self.flatten_parameters, - mixed_precision=(self.precision_plugin.precision == "16"), - reshard_after_forward=self.reshard_after_forward, - fp32_reduce_scatter=self.fp32_reduce_scatter, - compute_dtype=self.compute_dtype, - bucket_cap_mb=self.bucket_cap_mb, - state_dict_device=self.state_dict_device, - ) - - @contextlib.contextmanager - def model_sharded_context(self) -> Generator: - log.detail(f"{self.__class__.__name__}: entered model_sharded_context.") - precision = self.precision_plugin.precision - - def wrap_policy(*args: Any, **kwargs: Any) -> Any: - return default_auto_wrap_policy(*args, **kwargs, min_num_params=self.min_num_params) - - with enable_wrap( - wrapper_cls=FullyShardedDataParallel, - auto_wrap_policy=wrap_policy, - process_group=self.process_group, - cpu_offload=self.cpu_offload, - move_grads_to_cpu=self.move_grads_to_cpu, - flatten_parameters=self.flatten_parameters, - mixed_precision=(precision == "16"), - reshard_after_forward=self.reshard_after_forward, - fp32_reduce_scatter=self.fp32_reduce_scatter, - compute_dtype=self.compute_dtype, - bucket_cap_mb=self.bucket_cap_mb, - state_dict_device=self.state_dict_device, - ): - yield - - log.detail(f"{self.__class__.__name__}: exiting model_sharded_context.") - - def configure_ddp(self) -> None: - log.detail(f"{self.__class__.__name__}: configuring FSDP... (cpu_offload: [{self.cpu_offload}])") - if not self.cpu_offload: - # When using CPU Offload, FSDP will manage the CUDA movement for us. 
- # Note: this would be problematic for large model (which could not fit in one GPU) - # as FSDP module.to(device) would first summon all parameters - # (TODO: need to figure out solution) - self.model_to_device() - - def model_to_device(self) -> None: - log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") - # ensure we update the device type in the lightning module - assert self.lightning_module - self.lightning_module.to(self.root_device) - - def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - # we don't need precision context since casting is done by FSDP - # read `mixed_precision` docstring here: https://pytorch.org/docs/stable/fsdp.html - assert self.model is not None - return self.model(*args, **kwargs) - - def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - assert self.model is not None - return self.model(*args, **kwargs) - - def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - assert self.model is not None - return self.model(*args, **kwargs) - - def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - assert self.model is not None - return self.model(*args, **kwargs) - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "fsdp", cls, description="Fully sharded training with checkpointing the full state dict." - ) - - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) - - -def _optimizer_has_flat_params(optimizer: Optimizer) -> bool: - from fairscale.nn.misc.flatten_params_wrapper import FlatParameter - - return any(isinstance(param, FlatParameter) for param in optimizer.param_groups[0]["params"]) diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py deleted file mode 100644 index bc7b56b7c142d..0000000000000 --- a/src/pytorch_lightning/strategies/sharded.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple - -from torch import Tensor -from torch.nn import Module -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _reinit_optimizers_with_oss -from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS -else: - OSS = ShardedDataParallel = object - - -class DDPShardedStrategy(DDPStrategy): - """Optimizer and gradient sharded training provided by FairScale.""" - - strategy_name = "ddp_sharded" - _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def connect(self, model: "pl.LightningModule") -> None: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return super().connect(model) - - def setup(self, trainer: "pl.Trainer") -> None: - assert self.accelerator is not None - self.accelerator.setup(trainer) - - # move the model to the correct device - self.model_to_device() - - # skip wrapping the model if we are not fitting as no gradients need to be exchanged - trainer_fn = trainer.state.fn - if trainer_fn == TrainerFn.FITTING: - if self._layer_sync: - assert self.model is not None - self.model = self._layer_sync.apply(self.model) - - self.setup_precision_plugin() - - if trainer_fn == TrainerFn.FITTING: - self.configure_ddp() - - def configure_ddp(self) -> None: - self._set_ddp_kwargs() - assert self.lightning_module is not None - self.setup_optimizers(self.lightning_module.trainer) - assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - self.model, self.optimizers = self._setup_model_and_optimizers( - model=_LightningModuleWrapperBase(self.model), - optimizers=self.optimizers, - ) - _optimizers_to_device(self.optimizers, self.root_device) - - def _set_ddp_kwargs(self) -> None: - if "reduce_buffer_size" not in self._ddp_kwargs: - # For multi-node training, enabling bucketing will improve performance. - self._ddp_kwargs["reduce_buffer_size"] = self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0 - - def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: - """Wraps the model and optimizers with fairscale components. 
- - Return: - The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module - and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. - """ - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) - return model, optimizers - - def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - assert self.lightning_module is not None - if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: - return optimizers - optimizers = [o._optimizer if isinstance(o, LightningOptimizer) else o for o in optimizers] - return _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) - - def pre_backward(self, closure_loss: Tensor) -> None: - pass - - @contextmanager - def block_backward_sync(self) -> Generator: - """Blocks syncing gradients behaviour on backwards pass. - - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, ShardedDataParallel): - with self.model.no_sync(): - yield None - else: - yield None - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_sharded_find_unused_parameters_false", - cls, - description="DDP Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py deleted file mode 100644 index 74fb1f4026ec0..0000000000000 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Tuple - -from torch import Tensor -from torch.nn import Module -from torch.optim import Optimizer - -import pytorch_lightning as pl -from lightning_fabric.utilities.optimizer import _optimizers_to_device -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _reinit_optimizers_with_oss -from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS - -else: - OSS = ShardedDataParallel = object - - -class DDPSpawnShardedStrategy(DDPSpawnStrategy): - """Optimizer sharded training provided by FairScale.""" - - strategy_name = "ddp_sharded_spawn" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - rank_zero_deprecation( - "PyTorch Lightning's sharded implementation using FairScale has been deprecated in v1.9.0 and will be" - " removed in v2.0.0. You can try using the `Trainer(strategy='fsdp_native')` instead." - " The difference is that native FSDP uses PyTorch's implementation and the current strategy uses" - " FairScale's implementation (which was upstreamed to PyTorch). After removal, `strategy='fsdp'` will use" - " the native version by default." - ) - super().__init__(*args, **kwargs) - - def connect(self, model: "pl.LightningModule") -> None: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPSpawnShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return super().connect(model) - - def configure_ddp(self) -> None: - # set up optimizers after the wrapped module has been moved to the device - assert self.lightning_module is not None - self.setup_optimizers(self.lightning_module.trainer) - assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - self.model, self.optimizers = self._setup_model_and_optimizers( - model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers - ) - _optimizers_to_device(self.optimizers, self.root_device) - - def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: - """Wraps the model and optimizers with fairscale components. - - Return: - The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module - and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. 
- """ - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) - return model, optimizers - - def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - assert self.lightning_module - if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: - return optimizers - optimizers = [o._optimizer if isinstance(o, LightningOptimizer) else o for o in optimizers] - return _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) - - @contextmanager - def block_backward_sync(self) -> Generator: - """Blocks syncing gradients behaviour on backwards pass. - - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, ShardedDataParallel): - with self.model.no_sync(): - yield None - else: - yield None - - def pre_backward(self, closure_loss: Tensor) -> None: - pass - - def post_training_step(self) -> None: - pass - - @classmethod - def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_sharded_spawn_find_unused_parameters_false", - cls, - description="DDP Spawn Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6e23d22d2f315..53e7dd227480e 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -42,13 +42,11 @@ ColossalAIPrecisionPlugin, DeepSpeedPrecisionPlugin, DoublePrecisionPlugin, - FullyShardedNativeMixedPrecisionPlugin, HPUPrecisionPlugin, IPUPrecisionPlugin, MixedPrecisionPlugin, PLUGIN_INPUT, PrecisionPlugin, - ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, TPUPrecisionPlugin, ) @@ -58,9 +56,6 @@ from pytorch_lightning.strategies import ( ColossalAIStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -580,12 +575,8 @@ def _check_and_init_precision(self) -> PrecisionPlugin: ) device = "cpu" if self._accelerator_flag == "cpu" else "cuda" - if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): - return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) if isinstance(self.strategy, DDPFullyShardedNativeStrategy): return FullyShardedNativeNativeMixedPrecisionPlugin(self._precision_flag, device) - if isinstance(self.strategy, DDPFullyShardedStrategy): - return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return MixedPrecisionPlugin(self._precision_flag, device) raise RuntimeError("No precision set") @@ -670,10 +661,7 @@ def is_distributed(self) -> bool: return self.strategy.is_distributed distributed_strategy = ( DDPStrategy, - DDPSpawnShardedStrategy, - DDPShardedStrategy, DDPFullyShardedNativeStrategy, - DDPFullyShardedStrategy, DDPSpawnStrategy, DeepSpeedStrategy, TPUSpawnStrategy, diff --git a/tests/tests_app/components/multi_node/test_trainer.py b/tests/tests_app/components/multi_node/test_trainer.py index 7cd28a0e09992..be6a48430d099 100644 --- a/tests/tests_app/components/multi_node/test_trainer.py +++ 
b/tests/tests_app/components/multi_node/test_trainer.py @@ -94,6 +94,3 @@ def test_trainer_run_executor_arguments_choices( def test_trainer_run_executor_invalid_strategy_instances(): with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."): _, _ = _get_args_after_tracer_injection(strategy=pl.strategies.DDPSpawnStrategy()) - - with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."): - _, _ = _get_args_after_tracer_injection(strategy=pl.strategies.DDPSpawnShardedStrategy()) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index 87f5b91f48f1f..9368ea20ad21e 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -24,7 +24,6 @@ from pytorch_lightning.accelerators.mps import MPSAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.colossalai import _COLOSSALAI_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE @@ -62,7 +61,6 @@ def __new__( mps: Optional[bool] = None, skip_windows: bool = False, standalone: bool = False, - fairscale: bool = False, deepspeed: bool = False, rich: bool = False, omegaconf: bool = False, @@ -90,7 +88,6 @@ def __new__( skip_windows: Skip for Windows platform. standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. - fairscale: Require that facebookresearch/fairscale is installed. deepspeed: Require that microsoft/DeepSpeed is installed. rich: Require that willmcgugan/rich is installed. omegaconf: Require that omry/omegaconf is installed. @@ -179,14 +176,6 @@ def __new__( # used in conftest.py::pytest_collection_modifyitems kwargs["standalone"] = True - if fairscale: - if skip_windows: - raise ValueError( - "`skip_windows` is not necessary when `fairscale` is set as it does not support Windows." - ) - conditions.append(not _FAIRSCALE_AVAILABLE) - reasons.append("Fairscale") - if deepspeed: conditions.append(not _DEEPSPEED_AVAILABLE) reasons.append("Deepspeed") diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py deleted file mode 100644 index e040523c1e9c9..0000000000000 --- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch - -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from tests_pytorch.helpers.runif import RunIf - -ShardedGradScaler = None -if _FAIRSCALE_AVAILABLE: - from fairscale.optim.grad_scaler import ShardedGradScaler - - -@RunIf(fairscale=True) -@pytest.mark.parametrize( - "precision,scaler,expected", - [ - (16, torch.cuda.amp.GradScaler(), torch.cuda.amp.GradScaler), - (16, None, ShardedGradScaler), - ("bf16", None, None), - (32, None, None), - ], -) -def test_sharded_precision_scaler(precision, scaler, expected): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - plugin = ShardedNativeMixedPrecisionPlugin(precision=precision, scaler=scaler, device="cuda") - if expected: - assert isinstance(plugin.scaler, expected) - else: - assert not plugin.scaler diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index 8a96bd8fdd90c..368aec30e34a8 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -19,7 +19,7 @@ from lightning_fabric.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning import Trainer -from pytorch_lightning.strategies import DDPShardedStrategy, DDPStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies import DDPStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf @@ -59,23 +59,17 @@ def environment_combinations(): @RunIf(mps=False) @pytest.mark.parametrize( "strategy_cls", - [DDPStrategy, DDPShardedStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], + [DDPStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], ) @mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=True) def test_ranks_available_manual_strategy_selection(_, strategy_cls): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 - for i, (cluster, variables, expected) in enumerate(environment_combinations()): + for cluster, variables, expected in environment_combinations(): with mock.patch.dict(os.environ, variables): - if strategy_cls is DDPShardedStrategy and i == 0: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = strategy_cls( - parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster - ) - else: - strategy = strategy_cls( - parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster - ) + strategy = strategy_cls( + parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)], cluster_environment=cluster + ) trainer = Trainer(strategy=strategy, num_nodes=num_nodes) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] @@ -89,7 +83,6 @@ def test_ranks_available_manual_strategy_selection(_, strategy_cls): "trainer_kwargs", [ dict(strategy="ddp", accelerator="gpu", devices=[1, 2]), - dict(strategy="ddp_sharded", accelerator="gpu", devices=[1, 2]), dict(strategy="ddp_spawn", accelerator="cpu", devices=2), dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], @@ -99,7 +92,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, 
trainer_kwar num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) - for i, (cluster, variables, expected) in enumerate(environment_combinations()): + for cluster, variables, expected in environment_combinations(): if trainer_kwargs["strategy"] == "ddp_spawn": if isinstance(cluster, (SLURMEnvironment, TorchElasticEnvironment)): # slurm and torchelastic do not work with spawn strategies @@ -108,11 +101,7 @@ def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwar expected.update(global_rank=(expected["node_rank"] * 2), local_rank=0) with mock.patch.dict(os.environ, variables): - if "sharded" in trainer_kwargs["strategy"] and i == 0: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(**trainer_kwargs) - else: - trainer = Trainer(**trainer_kwargs) + trainer = Trainer(**trainer_kwargs) assert type(trainer.strategy.cluster_environment) is type(cluster) assert rank_zero_only.rank == expected["global_rank"] assert trainer.global_rank == expected["global_rank"] diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py deleted file mode 100644 index a60c4d8cb8ecf..0000000000000 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -from typing import Any, Dict, Optional -from unittest import mock - -import pytest -import torch - -from pytorch_lightning import Trainer -from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin -from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.runif import RunIf - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn import FullyShardedDataParallel, wrap - - -class TestFSDPModelManualWrapped(BoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.layer: Optional[torch.nn.Module] = None - - def _init_model(self) -> None: - self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) - - def setup(self, stage: str) -> None: - if self.layer is None: - self._init_model() - - def configure_sharded_model(self) -> None: - # the model is already wrapped with FSDP: no need to wrap again! 
- if isinstance(self.layer, FullyShardedDataParallel): - return - for i, layer in enumerate(self.layer): - if i % 2 == 0: - self.layer[i] = wrap(layer) - self.layer = wrap(self.layer) - - def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: - # when loading full state dict, we first need to create a new unwrapped model - self._init_model() - - def configure_optimizers(self): - return torch.optim.SGD(self.layer.parameters(), lr=0.1) - - def on_train_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_test_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_validation_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_prediction_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def _assert_layer_fsdp_instance(self) -> None: - assert isinstance(self.layer, FullyShardedDataParallel) - assert isinstance(self.layer.module[0], FullyShardedDataParallel) - assert isinstance(self.layer.module[2], FullyShardedDataParallel) - - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[0].reshard_after_forward - assert self.layer.module[2].reshard_after_forward - - if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): - assert self.layer.mixed_precision - assert self.layer.module[0].mixed_precision - assert self.layer.module[2].mixed_precision - - -class TestFSDPModelAutoWrapped(BoringModel): - def __init__(self): - super().__init__() - self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) - - def configure_optimizers(self): - return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) - - def on_train_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_test_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_validation_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def on_prediction_batch_end(self, *_, **__) -> None: - self._assert_layer_fsdp_instance() - - def _assert_layer_fsdp_instance(self) -> None: - assert isinstance(self.trainer.model, FullyShardedDataParallel) - # `disable_reshard_on_root=True` (default) in FSDP which turns-off resharding - assert not self.trainer.model.reshard_after_forward - - if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): - assert self.trainer.model.mixed_precision - - -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): - # Use FullySharded to get the state dict for the sake of comparison - model_state_dict = trainer.strategy.lightning_module_state_dict() - - if trainer.is_global_zero: - saved_model = cls.load_from_checkpoint(ckpt_path) - - # Assert model parameters are identical after loading - for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()): - assert torch.equal(ddp_param.float().cpu(), shard_param) - - -def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): - trainer.fit(model) - - model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path - - trainer.save_checkpoint(model_path, weights_only=True) - - _assert_save_equality(trainer, model_path, cls=model.__class__) - - # Test entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model) # model is wrapped, will not call configure_shared_model - - # provide model 
path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.test(model, ckpt_path=model_path) - - # Predict entry point - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` - - # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap - if model.__class__ is TestFSDPModelAutoWrapped: - model = TestFSDPModelAutoWrapped() - trainer.predict(model, ckpt_path=model_path) - - -def test_invalid_on_cpu(tmpdir): - """Test to ensure that to raise Misconfiguration for FSDP on CPU.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp") - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - with pytest.raises( - MisconfigurationException, match="You selected strategy to be `ddp_fully_sharded`, but GPU is not available." - ): - trainer.strategy.setup_environment() - - -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@RunIf(fairscale=True) -def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 - ) - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) - assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fully_sharded_strategy_checkpoint(tmpdir): - """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" - - model = TestFSDPModelManualWrapped() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - strategy="fsdp", - precision=16, - max_epochs=1, - enable_progress_bar=False, - enable_model_summary=False, - ) - _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fsdp_gradient_clipping_raises(tmpdir): - """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" - model = TestFSDPModelManualWrapped() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - strategy="fsdp", - fast_dev_run=True, - accelerator="gpu", - devices=1, - precision=16, - gradient_clip_val=1, - gradient_clip_algorithm="norm", - enable_progress_bar=False, - enable_model_summary=False, - ) - with pytest.raises( - MisconfigurationException, match="gradient_clip_algorithm='norm'` is currently not supported for `FullySharded" - ): - trainer.fit(model) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_fsdp_rewrap_limitation(tmpdir): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - max_steps=1, - limit_val_batches=0, - 
limit_test_batches=1, - strategy="fsdp", - ) - model = TestFSDPModelAutoWrapped() - trainer.fit(model) - - with pytest.raises(MisconfigurationException, match="Using the same instance of model .* not supported"): - trainer.test(model) - - -@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) -def test_invalid_parameters_in_optimizer(): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=1) - - class EmptyParametersModel(BoringModel): - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=1e-2) - - model = EmptyParametersModel() - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): - trainer.fit(model) - - class NoFlatParametersModel(BoringModel): - def configure_optimizers(self): - layer = torch.nn.Linear(4, 5) - return torch.optim.Adam(layer.parameters(), lr=1e-2) - - model = NoFlatParametersModel() - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"): - trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py index 39e10a05fc328..9e249e4f14a43 100644 --- a/tests/tests_pytorch/strategies/test_registry.py +++ b/tests/tests_pytorch/strategies/test_registry.py @@ -16,9 +16,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import CheckpointIO from pytorch_lightning.strategies import ( - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, + DDPFullyShardedNativeStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -67,17 +65,14 @@ def test_tpu_spawn_debug_strategy_registry(xla_available): assert isinstance(trainer.strategy, TPUSpawnStrategy) -def test_fsdp_strategy_registry(tmpdir): - - strategy = "fsdp" - +@RunIf(min_torch="1.12") +def test_fsdp_strategy_registry(cuda_count_1): + strategy = "fsdp_native" assert strategy in StrategyRegistry - assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedStrategy - - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy) + assert StrategyRegistry[strategy]["strategy"] == DDPFullyShardedNativeStrategy - assert isinstance(trainer.strategy, DDPFullyShardedStrategy) + trainer = Trainer(accelerator="cuda", strategy=strategy) + assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) @pytest.mark.parametrize( @@ -105,24 +100,10 @@ def test_fsdp_strategy_registry(tmpdir): {"find_unused_parameters": False, "start_method": "fork"}, marks=RunIf(skip_windows=True), ), - ( - "ddp_sharded_spawn_find_unused_parameters_false", - DDPSpawnShardedStrategy, - {"find_unused_parameters": False}, - ), - ( - "ddp_sharded_find_unused_parameters_false", - DDPShardedStrategy, - {"find_unused_parameters": False}, - ), ], ) def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params): - if "sharded" in strategy_name: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) - else: - trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) + trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) assert isinstance(trainer.strategy, strategy) assert strategy_name in StrategyRegistry assert StrategyRegistry[strategy_name]["init_params"] == expected_init_params diff --git 
a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py deleted file mode 100644 index 29fd4607c521b..0000000000000 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ /dev/null @@ -1,356 +0,0 @@ -import os -from copy import deepcopy -from typing import Mapping -from unittest import mock -from unittest.mock import Mock - -import pytest -import torch -from torch import Tensor - -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins import MixedPrecisionPlugin -from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy -from pytorch_lightning.trainer.states import TrainerFn -from tests_pytorch.helpers.runif import RunIf - -if _FAIRSCALE_AVAILABLE: - from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - from fairscale.optim import OSS - - -class ModelWithAdamOptimizer(BoringModel): - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.layer.parameters(), lr=0.1) - return optimizer - - -class CheckModelRestore(ModelWithAdamOptimizer): - def __init__(self, old_model_state_dict, old_optimizer_states): - super().__init__() - self.old_model_state_dict = old_model_state_dict - self.old_optimizer_states = old_optimizer_states - - def on_train_start(self): - assert all( - self._is_equal(actual, expected) for actual, expected in zip(self.state_dict(), self.old_model_state_dict) - ) - - for optimizer, state in zip(self.trainer.optimizers, self.old_optimizer_states): - optimizer_state = self.trainer.strategy.optimizer_state(optimizer) - self._is_equal(optimizer_state, state) - - def _is_equal(self, a, b): - if isinstance(a, Tensor): - return torch.allclose(a, b) - - if isinstance(a, Mapping): - return all(self._is_equal(a.get(k, None), b.get(k, None)) for k in b.keys()) - - return a == b - - -@pytest.mark.parametrize("clip_val", [0, 10]) -@RunIf(min_cuda_gpus=1, fairscale=True) -@mock.patch("fairscale.optim.oss.OSS.clip_grad_norm") -def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_val, tmpdir): - """Ensure that clip gradients is only called if the value is greater than 0.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, - strategy="ddp_sharded", - accelerator="gpu", - devices=1, - precision=16, - fast_dev_run=True, - gradient_clip_val=clip_val, - ) - trainer.fit(model) - if clip_val > 0: - mock_oss_clip_grad_norm.assert_called() - else: - mock_oss_clip_grad_norm.assert_not_called() - - -@RunIf(fairscale=True) -@pytest.mark.parametrize( - "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] -) -def test_sharded_ddp_choice(strategy, expected): - """Test to ensure that strategy is correctly chosen.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(fast_dev_run=True, strategy=strategy) - assert isinstance(trainer.strategy, expected) - - -@RunIf(min_cuda_gpus=1, fairscale=True) -@pytest.mark.parametrize( - "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] -) -def test_ddp_choice_sharded_amp(strategy, expected): - """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - with 
pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy) - assert isinstance(trainer.strategy, expected) - assert isinstance(trainer.precision_plugin, MixedPrecisionPlugin) - - -@RunIf(fairscale=True) -def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): - """Test to ensure that checkpoint is saved correctly.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): - """Test to ensure that checkpoint is saved correctly when using multiple GPUs.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - # Assert model parameters are identical after loading - for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(trained_param.to("cpu"), loaded_param) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_finetune(tmpdir): - """Test to ensure that we can save and restart training (simulate fine-tuning)""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_sharded_spawn", fast_dev_run=True) - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) - - trainer = Trainer(fast_dev_run=True) - trainer.fit(saved_model) - - -@RunIf(fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): - """Test to ensure that resuming from checkpoint works.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - - model = BoringModel() - - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model, ckpt_path=checkpoint_path) - - -@RunIf(min_cuda_gpus=1, fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): - """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=1, fast_dev_run=True) - - trainer.fit(model) - - checkpoint_path = 
os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - - model = BoringModel() - - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy="ddp_sharded_spawn", accelerator="cpu", devices=2, fast_dev_run=True) - - trainer.fit(model, ckpt_path=checkpoint_path) - - -@RunIf(standalone=True, fairscale=True) -@pytest.mark.parametrize( - "trainer_kwargs", - ( - dict(accelerator="cpu", devices=2), - pytest.param(dict(accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)), - ), -) -def test_ddp_sharded_strategy_test_multigpu(trainer_kwargs): - """Test to ensure we can use validate and test without fit.""" - model = BoringModel() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - fast_dev_run=True, - enable_progress_bar=False, - enable_model_summary=False, - **trainer_kwargs, - ) - - trainer.validate(model) - trainer.test(model) - - -class BoringModelSharded(BoringModel): - def on_train_start(self) -> None: - """Check if trainer module is wrapped as ShardedDataParallel during training stage.""" - assert isinstance(self.trainer.model, ShardedDataParallel) - - def on_test_start(self) -> None: - """Check if trainer module remains as LightningModule during test stage.""" - assert isinstance(self.trainer.model, LightningModule) - - def on_validation_start(self) -> None: - """Check if trainer module remains as LightningModule during test stage.""" - if self.trainer.state.fn == TrainerFn.FITTING: - assert isinstance(self.trainer.model, ShardedDataParallel) - else: - assert isinstance(self.trainer.model, LightningModule) - - def on_predict_start(self) -> None: - """Check if trainer module remains as LightningModule during prediction stage.""" - assert isinstance(self.trainer.model, LightningModule) - - -@RunIf(fairscale=True) -def test_configure_ddp(tmpdir): - """Tests with ddp sharded strategy.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True) - - model = BoringModelSharded() - - trainer.fit(model) - trainer.test(model, dataloaders=model.test_dataloader()) - trainer.validate(model, dataloaders=model.val_dataloader()) - trainer.predict(model, dataloaders=model.predict_dataloader()) - - -@RunIf(fairscale=True) -@mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) -@pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy]) -def test_custom_kwargs_sharded(_, cls): - """Tests to ensure that if custom kwargs are passed, they are set correctly.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = cls(reduce_fp16=True) - strategy._lightning_module = Mock(spec=LightningModule) - strategy._lightning_module.trainer = Mock() - strategy.parallel_devices = [Mock()] - class_name = "sharded" if isinstance(strategy, DDPShardedStrategy) else "sharded_spawn" - - with mock.patch(f"pytorch_lightning.strategies.{class_name}.ShardedDataParallel", autospec=True) as mock_sharded: - strategy.configure_ddp() - args, kwargs = mock_sharded.call_args - assert "reduce_fp16" in kwargs - assert kwargs["reduce_fp16"] - - -@RunIf(fairscale=True) -@mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) -@pytest.mark.parametrize(["params", "expected_buffer_size"], [(dict(), 0), (dict(reduce_buffer_size=128), 128)]) 
-@pytest.mark.parametrize("num_nodes", [1, 2]) -def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_size, num_nodes): - """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs.""" - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = DDPShardedStrategy(**params) - strategy.num_nodes = num_nodes - strategy._lightning_module = Mock(spec=LightningModule) - strategy._lightning_module.trainer = Mock() - strategy.parallel_devices = [Mock()] - - with mock.patch("pytorch_lightning.strategies.sharded.ShardedDataParallel", autospec=True) as mock_sharded: - strategy.configure_ddp() - args, kwargs = mock_sharded.call_args - assert "reduce_buffer_size" in kwargs - - if num_nodes > 1 and len(params) == 0: - # If user has not specified a buffer size and we're using multiple nodes, check to see if default is set - assert kwargs["reduce_buffer_size"] == DDPShardedStrategy._REDUCE_BUFFER_SIZE_DEFAULT - else: - assert kwargs["reduce_buffer_size"] == expected_buffer_size - - -@RunIf(fairscale=True) -def test_block_backward_sync(): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - strategy = DDPShardedStrategy() - model = mock.MagicMock(spec=ShardedDataParallel) - with mock.patch.object(strategy, "_model", model): - with strategy.block_backward_sync(): - pass - model.no_sync.assert_called_once() - - -@pytest.mark.parametrize( - "strategy_name,expected_ddp_kwargs", - [ - ("ddp_sharded", {}), - ("ddp_sharded_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_sharded_spawn", {}), - ("ddp_sharded_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), - ], -) -def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy_name) - assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs - - -class BoringFairScaleOptimizerModel(BoringModel): - def configure_optimizers(self): - base_optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) - - -@RunIf(min_cuda_gpus=2, fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir): - model = ModelWithAdamOptimizer() - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=1, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=2, - ) - trainer.fit(model) - - checkpoint_path = trainer.checkpoint_callback.best_model_path - ckpt = torch.load(checkpoint_path) - old_model_state_dict = deepcopy(ckpt["state_dict"]) - old_optimizer_states = deepcopy(ckpt["optimizer_states"]) - - model = CheckModelRestore(old_model_state_dict, old_optimizer_states) - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - strategy="ddp_sharded_spawn", - max_epochs=2, - limit_train_batches=1, - limit_val_batches=0, - accelerator="gpu", - devices=1, - ) - trainer.fit(model, ckpt_path=checkpoint_path) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 2248c14fee75c..6d0f02250cf25 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ 
b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -39,8 +39,6 @@ from pytorch_lightning.strategies import ( DataParallelStrategy, DDPFullyShardedNativeStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, DDPSpawnStrategy, DDPStrategy, DeepSpeedStrategy, @@ -241,11 +239,6 @@ def test_interactive_incompatible_backend_error(cuda_count_2, monkeypatch): with pytest.raises(MisconfigurationException, match=r"strategy='ddp_spawn'\)`.*is not compatible"): Trainer(strategy="ddp_spawn", accelerator="gpu", devices=2) - with pytest.raises( - MisconfigurationException, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible" - ), pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - Trainer(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2) - with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): # Edge case: AcceleratorConnector maps dp to ddp if accelerator != gpu Trainer(strategy="dp") @@ -277,20 +270,12 @@ def test_interactive_compatible_strategy_ddp_fork(monkeypatch): [ ("ddp", DDPStrategy), ("ddp_spawn", DDPSpawnStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ], ) @pytest.mark.parametrize("devices", [1, 2]) def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strategy_class, devices): - if "sharded" in strategy: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer( - default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices - ) - else: - trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) + trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) assert isinstance(trainer.strategy, strategy_class) @@ -380,23 +365,15 @@ def test_exception_invalid_strategy(): ("ddp", DDPStrategy), ("ddp_find_unused_parameters_false", DDPStrategy), ("dp", DataParallelStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ), ) @pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()]) def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class, mps_count_1, cuda_count_0): - if "sharded" in strategy: - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): - Trainer(accelerator=accelerator, strategy=strategy) - else: - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): - Trainer(accelerator=accelerator, strategy=strategy) - - with pytest.raises(ValueError, match="strategies from the DDP family are not supported"), pytest.deprecated_call( - match="FairScale has been deprecated in v1.9.0" - ): + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator=accelerator, strategy=strategy) + + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): Trainer(accelerator="mps", strategy=strategy_class()) @@ -420,7 +397,6 @@ def test_strategy_choice_cpu_instance(strategy_class): assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize( ["strategy", "strategy_class"], [ @@ -429,31 +405,22 @@ def 
test_strategy_choice_cpu_instance(strategy_class): ("ddp", DDPStrategy), ("ddp_find_unused_parameters_false", DDPStrategy), ("dp", DataParallelStrategy), - ("ddp_sharded", DDPShardedStrategy), - ("ddp_sharded_spawn", DDPSpawnShardedStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ], ) -def test_strategy_choice_gpu_str(strategy, strategy_class): - if "sharded" in strategy: - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) - else: - trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) +def test_strategy_choice_gpu_str(strategy, strategy_class, cuda_count_2): + trainer = Trainer(strategy=strategy, accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) -def test_strategy_choice_gpu_instance(strategy_class): +def test_strategy_choice_gpu_instance(strategy_class, cuda_count_2, mps_count_0): trainer = Trainer(strategy=strategy_class(), accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) -@RunIf(min_cuda_gpus=2) @pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) -def test_device_type_when_strategy_instance_gpu_passed(strategy_class): - +def test_device_type_when_strategy_instance_gpu_passed(strategy_class, cuda_count_2, mps_count_0): trainer = Trainer(strategy=strategy_class(), accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) assert isinstance(trainer.accelerator, CUDAAccelerator) diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 81bbe0f9f2e15..941e5eafdf7e6 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -51,15 +51,7 @@ ) from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler -from pytorch_lightning.strategies import ( - DataParallelStrategy, - DDPFullyShardedStrategy, - DDPShardedStrategy, - DDPSpawnShardedStrategy, - DDPSpawnStrategy, - DDPStrategy, - SingleDeviceStrategy, -) +from pytorch_lightning.strategies import DataParallelStrategy, DDPSpawnStrategy, DDPStrategy, SingleDeviceStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE @@ -2012,13 +2004,6 @@ def training_step(self, batch, batch_idx): CPUAccelerator, 1, ), - ( - {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 1, - ), ( {"strategy": DDPSpawnStrategy(), "accelerator": "cpu", "devices": 2}, DDPSpawnStrategy, @@ -2042,20 +2027,6 @@ def training_step(self, batch, batch_idx): CUDAAccelerator, 2, ), - ( - {"strategy": DDPFullyShardedStrategy(), "accelerator": "cuda", "devices": 2}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 2, - ), - ( - {"strategy": DDPSpawnShardedStrategy(), "accelerator": "cuda", "devices": 2}, - DDPSpawnShardedStrategy, - "ddp_sharded_spawn", - CUDAAccelerator, - 2, - ), ( {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPSpawnStrategy, @@ -2063,39 +2034,13 @@ def training_step(self, batch, batch_idx): 
CUDAAccelerator, 2, ), - ( - {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1, "num_nodes": 2}, - DDPFullyShardedStrategy, - "ddp_fully_sharded", - CUDAAccelerator, - 1, - ), - ( - {"strategy": "ddp_sharded", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, - DDPShardedStrategy, - "ddp_sharded", - CUDAAccelerator, - 2, - ), - ( - {"strategy": "ddp_sharded_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, - DDPSpawnShardedStrategy, - "ddp_sharded_spawn", - CUDAAccelerator, - 2, - ), ], ) def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices): if trainer_kwargs.get("accelerator") == "cuda": mock_cuda_count(monkeypatch, trainer_kwargs["devices"]) - strategy = trainer_kwargs.get("strategy") - if (isinstance(strategy, str) and "sharded" in strategy) or isinstance(strategy, (DDPShardedStrategy)): - with pytest.deprecated_call(match="FairScale has been deprecated in v1.9.0"): - trainer = Trainer(**trainer_kwargs) - else: - trainer = Trainer(**trainer_kwargs) + trainer = Trainer(**trainer_kwargs) assert isinstance(trainer.strategy, strategy_cls) assert strategy_cls.strategy_name == strategy_name @@ -2191,7 +2136,7 @@ def on_fit_start(self): logger.finalize.assert_called_once_with("failed") -# TODO: replace with 1.14 when it is released +# TODO: replace with 2.0 when it is released @RunIf(min_torch="1.14.0.dev20221202") def test_trainer_compiled_model(): model = BoringModel() @@ -2219,11 +2164,9 @@ def test_trainer_compiled_model(): model = torch.compile(model) - trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPShardedStrategy) - + trainer = Trainer(fast_dev_run=True, strategy="fsdp_native") with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): trainer.fit(model) - trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPStrategy) - + trainer = Trainer(fast_dev_run=True, strategy="ddp") trainer.fit(model) diff --git a/tests/tests_pytorch/utilities/test_imports.py b/tests/tests_pytorch/utilities/test_imports.py index cc24904265cf6..ff1b3d5a463e4 100644 --- a/tests/tests_pytorch/utilities/test_imports.py +++ b/tests/tests_pytorch/utilities/test_imports.py @@ -20,7 +20,7 @@ from unittest import mock import pytest -from lightning_utilities.core.imports import compare_version, module_available, RequirementCache +from lightning_utilities.core.imports import compare_version, RequirementCache from torch.distributed import is_available from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE @@ -108,18 +108,13 @@ def clean_import(): _shortcut_patch(RequirementCache.__bool__, ("jsonargparse[signatures]>=4.12.0",), ("requirement",)), "pytorch_lightning.cli", ), - ( - "lightning_utilities.core.imports.module_available", - _shortcut_patch(module_available, ("fairscale.nn",)), - "pytorch_lightning.strategies", - ), ( "lightning_utilities.core.imports.compare_version", _shortcut_patch(compare_version, ("torch", operator.ge, "1.12.0")), "pytorch_lightning.strategies.fully_sharded_native", ), ], - ids=["ProcessGroup", "neptune", "cli", "fairscale", "fully_sharded_native"], + ids=["ProcessGroup", "neptune", "cli", "fully_sharded_native"], ) def test_import_with_unavailable_dependencies(patch_name, new_fn, to_import, clean_import): """This tests simulates unavailability of certain modules by patching the functions that check for their From 668096946d7c462d99c2b976bb60ff8ecf2d8803 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 20 Jan 2023 11:11:23 +0100 Subject: [PATCH 2/2] Update src/pytorch_lightning/CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index a800ee853d123..0ad48ba732ca1 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -44,7 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `Trainer(strategy='horovod')` support ([#16150](https://github.com/Lightning-AI/lightning/pull/16150)) -- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#TODO](https://github.com/PyTorchLightning/pytorch-lightning/pull/TODO)) +- `FairScale` removal (in favor of PyTorch's FSDP implementation) ([#16400](https://github.com/PyTorchLightning/pytorch-lightning/pull/16400)) * Removed the `pytorch_lightning.overrides.fairscale.LightningShardedDataParallel` class * Removed the `pytorch_lightning.plugins.precision.fully_sharded_native_amp.FullyShardedNativeMixedPrecisionPlugin` class * Removed the `pytorch_lightning.plugins.precision.sharded_native_amp.ShardedNativeMixedPrecisionPlugin` class
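
Migration sketch (an illustration, not part of the patch itself): after this removal, user code that previously requested one of the FairScale-backed strategies, for example `Trainer(strategy="ddp_sharded")` or `strategy="fsdp"`, would move to the PyTorch-native FSDP strategy that this diff keeps, registered as `fsdp_native` and backed by `DDPFullyShardedNativeStrategy`. A minimal sketch, assuming PyTorch Lightning at this point in the 1.9/2.0 line, torch >= 1.12, and at least two CUDA devices:

from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel

model = BoringModel()

# previously (removed in this patch):
# trainer = Trainer(strategy="ddp_sharded", accelerator="gpu", devices=2, precision=16)

# PyTorch-native FSDP replacement, as exercised by test_fsdp_strategy_registry above
trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=2, precision=16)
trainer.fit(model)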