# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import logging
import os
from typing import Any, Dict, Generator, List, Optional, Union

import pytorch_lightning as pl
import torch
from pytorch_lightning.overrides.distributed import prepare_for_backward
from pytorch_lightning.plugins.environments.cluster_environment import (
    ClusterEnvironment,
)
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.strategies.parallel import ParallelStrategy
from pytorch_lightning.trainer.states import TrainerFn
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities.distributed import (
    init_dist_connection,
    sync_ddp_if_available,
    ReduceOp,
    group as _group,
    _get_process_group_backend_from_env,
    distributed_available,
    get_default_process_group_backend_for_device,
)
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
from pytorch_lightning.utilities.optimizer import optimizers_to_device
from pytorch_lightning.utilities.seed import reset_seed
from torch.distributed.distributed_c10d import _get_default_group
from torch.distributed.fsdp.fully_sharded_data_parallel import (
    BackwardPrefetch,
    CPUOffload,
    FullyShardedDataParallel,
)
from torch.distributed.fsdp.wrap import enable_wrap


log = logging.getLogger(__name__)


class DDPFullyShardedNativeStrategy(ParallelStrategy):

    strategy_name = "fsdp_native"

    def __init__(
        self,
        accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None,
        cpu_offload: Optional[CPUOffload] = None,
        backward_prefetch: Optional[BackwardPrefetch] = None,
        parallel_devices: Optional[List[torch.device]] = None,
        cluster_environment: Optional[ClusterEnvironment] = None,
        checkpoint_io: Optional[CheckpointIO] = None,
        precision_plugin: Optional[PrecisionPlugin] = None,
        process_group_backend: Optional[str] = None,
    ) -> None:
        """Strategy for Fully Sharded Data Parallel provided by ``torch.distributed``.

        Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model
        size while using efficient communication to reduce overhead. In practice, this means we can remain
        at parity with PyTorch DDP while scaling our model sizes dramatically. The technique is similar
        to ZeRO-Stage 3. For more information see
        `the PyTorch blog post <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_.

        .. warning:: ``DDPFullyShardedNativeStrategy`` is in beta and subject to change.

        Defaults have been set and options have been exposed, but they may require configuration
        depending on your memory/speed trade-offs. We suggest having a look at
        `the FSDP tutorial <https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html>`_ for more information.

        Arguments:
            cpu_offload (Optional[CPUOffload]):
                CPU offloading config. Currently, only parameter and gradient CPU
                offload is supported. It can be enabled by passing
                ``cpu_offload=CPUOffload(offload_params=True)``. Note that this
                currently implicitly enables gradient offloading to CPU as well, so that
                params and grads are on the same device for the optimizer to work with.
                This API is subject to change. Default is ``None``, in which case there
                is no offloading.
            backward_prefetch (Optional[BackwardPrefetch]):
                This is an experimental feature that is subject to change in the
                near future. It allows users to enable two different backward prefetching
                algorithms to help overlap backward communication and computation.
                The pros and cons of each algorithm are explained in the ``BackwardPrefetch`` class.
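
        Example (a minimal usage sketch; the 4-GPU setup and ``MyLightningModule`` are illustrative
        placeholders, not part of this class)::

            from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload

            strategy = DDPFullyShardedNativeStrategy(cpu_offload=CPUOffload(offload_params=True))
            trainer = pl.Trainer(accelerator="gpu", devices=4, strategy=strategy)
            trainer.fit(MyLightningModule())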
        """
        super().__init__(
            accelerator=accelerator,
            parallel_devices=parallel_devices,
            cluster_environment=cluster_environment,
            checkpoint_io=checkpoint_io,
            precision_plugin=precision_plugin,
        )
        self._process_group = None
        self.num_processes = (
            len(self.parallel_devices) if self.parallel_devices is not None else 0
        )
        self._has_loaded_state_dict: bool = False
        self._process_group_backend: Optional[str] = process_group_backend
        self.cpu_offload = cpu_offload
        self.backward_prefetch = backward_prefetch

    @property
    def root_device(self) -> torch.device:
        return self.parallel_devices[self.local_rank]

    @property
    def process_group(self):
        if self._process_group is None:
            # The strategy should have already initialized the process group in setup_environment()
            self._process_group = _get_default_group()
        return self._process_group

    @property
    def setup_optimizers_in_pre_dispatch(self) -> bool:
        # Set up optimizers only after the fully sharded model has been created
        return True

    @property
    def process_group_backend(self) -> Optional[str]:
        return self._process_group_backend

    def setup_environment(self) -> None:
        self.setup_distributed()
        super().setup_environment()

    def setup_distributed(self) -> None:
        if self.root_device.type != "cuda":
            raise MisconfigurationException(
                "You selected the `fsdp_native` strategy, but a GPU device is not available."
            )
        reset_seed()
        # set warning rank
        rank_zero_only.rank = self.global_rank

        self._process_group_backend = self._get_process_group_backend()
        init_dist_connection(self.cluster_environment, self._process_group_backend)

    def _get_process_group_backend(self) -> str:
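        # Resolution order: the explicit ``process_group_backend`` argument, then any environment
        # override read by ``_get_process_group_backend_from_env``, then the default backend for
        # ``root_device`` (e.g. "nccl" for CUDA devices).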
        return (
            self._process_group_backend
            or _get_process_group_backend_from_env()
            or get_default_process_group_backend_for_device(self.root_device)
        )

    def setup(self, trainer: "pl.Trainer") -> None:
        self.accelerator.setup(trainer)

        if trainer.state.fn == TrainerFn.FITTING and self._layer_sync:
            self.model = self._layer_sync.apply(self.model)

        if not self.cpu_offload:
            # When using CPU offload, FSDP manages the CUDA movement for us.
            # Note: this would be problematic for large models that cannot fit on a single GPU,
            # as FSDP's module.to(device) would first summon all parameters.
            self.model_to_device()

        self.barrier()
        self.setup_optimizers(trainer)
        optimizers_to_device(self.optimizers, self.root_device)
        self.setup_precision_plugin()

    def model_to_device(self) -> None:
        # ensure we update the device type in the lightning module
        log.info(
            f"{self.__class__.__name__}: moving model to device [{self.root_device}]..."
        )
        self.lightning_module.to(self.root_device)

    @contextlib.contextmanager
    def model_sharded_context(self) -> Generator:
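        """Provide a context in which ``wrap`` calls produce ``FullyShardedDataParallel`` modules.

        A minimal sketch of how a ``LightningModule`` could take advantage of this context (the layer
        size and attribute name are illustrative assumptions)::

            from torch.distributed.fsdp.wrap import wrap

            class MyModel(pl.LightningModule):
                def configure_sharded_model(self):
                    # modules wrapped here are sharded as they are created
                    self.block = wrap(torch.nn.Linear(32, 32))
        """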
        log.detail(f"{self.__class__.__name__}: entered model_sharded_context.")

        with enable_wrap(
            wrapper_cls=FullyShardedDataParallel,
            process_group=self.process_group,
            cpu_offload=self.cpu_offload,
            backward_prefetch=self.backward_prefetch,
        ):
            yield

    def barrier(self, *args, **kwargs) -> None:
        if not distributed_available():
            return
        if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
            torch.distributed.barrier(device_ids=self._determine_device_ids())
        else:
            torch.distributed.barrier()

    def broadcast(self, obj: object, src: int = 0) -> object:
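        """Broadcast a picklable ``obj`` from rank ``src`` to every other rank and return the
        received object; this wraps ``torch.distributed.broadcast_object_list``."""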
        obj = [obj]
        if self.global_rank != src:
            obj = [None]
        torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
        return obj[0]

    def reduce(
        self,
        tensor,
        group: Optional[Any] = None,
        reduce_op: Union[ReduceOp, str] = "mean",
    ) -> torch.Tensor:
        """Reduces a tensor from several distributed processes to one aggregated tensor.

        Args:
            tensor: the tensor to sync and reduce
            group: the process group to gather results from. Defaults to all processes (world)
            reduce_op: the reduction operation. Defaults to 'mean'/'avg'.
                Can also be a string 'sum' to calculate the sum during reduction.

        Return:
            The reduced value. If the input was not a tensor, it is returned unchanged.
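
        Example (a brief sketch; assumes this strategy instance is reachable as ``trainer.strategy``)::

            # sum a per-process counter across all processes
            count = torch.tensor(1.0, device=trainer.strategy.root_device)
            total = trainer.strategy.reduce(count, reduce_op="sum")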
        """
        if isinstance(tensor, torch.Tensor):
            tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op)
        return tensor

    def pre_backward(self, closure_loss: torch.Tensor) -> None:
        """Run before precision plugin executes backward."""
        if not self.lightning_module.automatic_optimization:
            prepare_for_backward(self.model, closure_loss)

    def _determine_device_ids(self):
        if self.root_device.type == "cpu":
            return None
        return [self.root_device.index]

    def teardown(self) -> None:
        log.info(f"{self.__class__.__name__}: tearing down strategy...")
        super().teardown()

        if self._layer_sync:
            self.model = self._layer_sync.revert(self.model)

        if self.root_device.type == "cuda":
            # GPU teardown
            if not os.environ.get("PL_SKIP_CPU_COPY_ON_DDP_TEARDOWN"):
                log.info(f"{self.__class__.__name__}: moving model to CPU...")
                self.lightning_module.cpu()
            # clean up memory
            torch.cuda.empty_cache()

        self._has_loaded_state_dict = False

    @classmethod
    def register_strategies(cls, strategy_registry: Dict) -> None:
        strategy_registry.register(
            "fsdp_native",
            cls,
            description="Fully Sharded Data Parallel training from torch.distributed.",
        )
        strategy_registry.register(
            "fsdp_native_full_shard_offload",
            cls,
            description="Native FSDP with full sharding and CPU offloading.",
            cpu_offload=CPUOffload(offload_params=True),
        )
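        # Either registered alias can then be selected by name, e.g. (a usage sketch; the
        # accelerator/devices values are illustrative):
        #   pl.Trainer(strategy="fsdp_native_full_shard_offload", accelerator="gpu", devices=4)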