1313# limitations under the License.
1414from contextlib import contextmanager
1515from datetime import timedelta
16- from typing import Any, Dict, Generator, List, Optional, Union, TYPE_CHECKING
16+ from typing import Any, Dict, Generator, List, Optional, TYPE_CHECKING, Union
1717
1818import torch
1919from torch import Tensor
2020from torch.distributed import default_pg_timeout
2121from torch.nn import Module
2222
2323from lightning_lite.accelerators import Accelerator
24- from lightning_lite.plugins import CheckpointIO, ClusterEnvironment
24+ from lightning_lite.plugins import CheckpointIO, ClusterEnvironment, Precision
2525from lightning_lite.plugins.precision.fsdp import FSDPPrecision
26- from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device, distributed_available
27- from lightning_lite.utilities.distributed import group as _group
28- from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
29- from lightning_lite.utilities.seed import reset_seed
30- from lightning_lite.plugins import Precision
3126from lightning_lite.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
3227from lightning_lite.strategies.parallel import ParallelStrategy
3328from lightning_lite.strategies.strategy import TBroadcast
29+ from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device
30+ from lightning_lite.utilities.distributed import group as _group
31+ from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
3432from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_12
3533from lightning_lite.utilities.rank_zero import rank_zero_only
34+ from lightning_lite.utilities.seed import reset_seed
3635
3736if TYPE_CHECKING:
3837 from torch.distributed.fsdp.fully_sharded_data_parallel import (
4342 )
4443 from torch.distributed.fsdp.wrap import enable_wrap
4544
45+ _FSDP_ALIASES = ("fsdp", "fsdp_full_shard_offload")
46+
4647
4748class FSDPStrategy(ParallelStrategy):
4849 r"""Strategy for Fully Sharded Data Parallel provided by torch.distributed.
4950
50- .. warning:: ``DDPFullyShardedNativeStrategy`` is in BETA and subject to change. The interface can
51+ .. warning:: ``FSDPStrategy`` is in BETA and subject to change. The interface can
5152 bring breaking changes and new features with the next release of PyTorch.
5253
5354 Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model
@@ -62,30 +63,20 @@ class FSDPStrategy(ParallelStrategy):
6263 `this tutorial <https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html>`__ for more information.
6364
6465 Arguments:
65- cpu_offload:
66- CPU offloading config. Currently, only parameter and gradient CPU
67- offload is supported. It can be enabled via passing in
68- ``cpu_offload=CPUOffload(offload_params=True)``. Note that this
69- currently implicitly enables gradient offloading to CPU in order for
70- params and grads to be on same device to work with optimizer. This
71- API is subject to change. Default is ``None`` in which case there
66+ cpu_offload: CPU offloading config. Currently, only parameter and gradient CPU offload is supported. It
67+ can be enabled via passing in ``cpu_offload=CPUOffload(offload_params=True)``. Note that this currently
68+ implicitly enables gradient offloading to CPU in order for parameters and gradients to be on the same device
69+ to work with the optimizer. This API is subject to change. Default is ``None`` in which case there
7270 will be no offloading.
73- backward_prefetch:
74- This is an experimental feature that is subject to change in the
75- the near future. It allows users to enable two different backward_prefetch
76- algorithms to help backward communication and computation overlapping.
77- The pros and cons of each algorithm is explained in the class ``BackwardPrefetch``.
78- mixed_precision:
79- Mixed Precision config. By default, Lightning will enable FP16 if ``precision=16``
80- or BF16 if ``precision=bf16`` unless a config is passed in.
81- This is only available in PyTorch 1.12 and later.
82- \**kwargs: Passed to the FSDP context manager which will configure the FSDP class when wrapping modules.
83-
71+ backward_prefetch: This is an experimental feature that is subject to change in the near future. It allows
72+ users to enable two different backward prefetching algorithms to help backward communication and
73+ computation overlapping. The pros and cons of each algorithm are explained in the class ``BackwardPrefetch``.
74+ mixed_precision: Mixed Precision config. By default, Lightning will enable FP16 if ``precision=16`` or BF16
75+ if ``precision=bf16`` unless a config is passed in. This is only available in PyTorch 1.12 and later.
76+ \**kwargs: Optional keyword arguments passed to the FSDP context manager which will configure the FSDP class
77+ when wrapping modules.
8478 """
8579
86- strategy_name = "fsdp_native"
87- _registered_strategies: List[str] = []
88-
8980def __init__(
9081 self,
9182 accelerator: Optional[Accelerator] = None,
@@ -169,6 +160,7 @@ def setup_module(self, module: Module) -> FullyShardedDataParallel:
169160 """Wraps the model into a
170161 :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module."""
171162from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
163+
172164 if (
173165any(isinstance(mod, FullyShardedDataParallel) for mod in module.modules())
174166and "auto_wrap_policy" in self._ddp_kwargs
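
For reference, a minimal usage sketch (not part of this diff) of how the options documented in the docstring above fit together. ``CPUOffload`` and ``BackwardPrefetch`` come from ``torch.distributed.fsdp`` (PyTorch 1.12+); the exact import path of ``FSDPStrategy`` and the availability of a ``strategy=`` argument on the Lite entry point are assumptions here, not taken from this commit.

# Sketch only: assumes FSDPStrategy is importable from lightning_lite.strategies
# and that the Lite entry point accepts a `strategy=` argument.
from torch.distributed.fsdp import BackwardPrefetch, CPUOffload  # requires PyTorch >= 1.12

from lightning_lite.strategies import FSDPStrategy  # assumed import path

strategy = FSDPStrategy(
    # Offload parameters (and, implicitly, gradients) to CPU, as the docstring describes.
    cpu_offload=CPUOffload(offload_params=True),
    # Experimental prefetching of the next parameter shard during the backward pass.
    backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
)

# Alternatively, the string aliases registered by this module ("fsdp",
# "fsdp_full_shard_offload") can be passed instead of an instance, e.g. strategy="fsdp".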