Commit 5cb313e

Merge branch 'master' into tpu-iterable-datasets
2 parents d852d22 + 0c02c44 commit 5cb313e

File tree

12 files changed (+175, -60 lines)

.github/workflows/_legacy-checkpoints.yml

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ jobs:
       run: echo ${{ needs.create-legacy-ckpts.outputs.pl-version }} >> ${{ env.legacy_dir }}/back-compatible-versions.txt

   - name: Create Pull Request
-    uses: peter-evans/create-pull-request@v4
+    uses: peter-evans/create-pull-request@v5
     with:
       title: Adding test for legacy checkpoint created with ${{ needs.create-legacy-ckpts.outputs.pl-version }}
       delete-branch: true

docs/source-pytorch/advanced/model_parallel.rst

Lines changed: 2 additions & 2 deletions
@@ -88,9 +88,9 @@ simplest way to do it is auto wrapping, which can serve as a drop-in replacement
 have to ``wrap`` layers manually as in the case of manual wrapping.

 .. note::
-    While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else
+    For users of PyTorch < 2.0: While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else
     PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and your
-    ``lightning_module.parameters()`` will return a generator with no params. This inconvenience will be addressed in the future.
+    ``lightning_module.parameters()`` will return a generator with no params.


 .. code-block:: python
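
Illustration (not part of the diff): a minimal, hedged sketch of the pre-2.0 pattern the note describes, where `configure_optimizers` must reference the FSDP-wrapped parameters through `self.trainer.model.parameters()`. The class and layer names below are hypothetical.

import torch
from lightning.pytorch import LightningModule


class AutoWrappedModel(LightningModule):  # hypothetical example module
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def configure_optimizers(self):
        # PyTorch < 2.0 with FSDP auto-wrap: the flattened parameters live on the
        # wrapped model, so reference them via the Trainer's model.
        return torch.optim.Adam(self.trainer.model.parameters(), lr=1e-3)

On PyTorch >= 2.0, where this commit defaults to `use_orig_params=True`, `self.parameters()` can be used directly instead.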

docs/source-pytorch/tutorials.rst

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+:orphan:
+
+PyTorch Lightning Tutorials
+===========================
+
+.. tutoriallist::

src/lightning/fabric/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Added

--
+- Added support for joint setup of model and optimizer with FSDP ([#17305](https://github.com/Lightning-AI/lightning/pull/17305))
+- Added support for handling multiple parameter groups in optimizers set up with FSDP ([#17305](https://github.com/Lightning-AI/lightning/pull/17305))


 ### Changed

src/lightning/fabric/fabric.py

Lines changed: 2 additions & 1 deletion
@@ -28,6 +28,7 @@
 from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, RandomSampler, SequentialSampler

 from lightning.fabric.loggers import Logger
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0

 from lightning.fabric.plugins import Precision  # avoid circular imports: # isort: split
 from lightning.fabric.accelerators.accelerator import Accelerator

@@ -798,7 +799,7 @@ def _validate_setup(self, module: nn.Module, optimizers: Sequence[Optimizer]) ->
         if any(isinstance(opt, _FabricOptimizer) for opt in optimizers):
             raise ValueError("An optimizer should be passed only once to the `setup` method.")

-        if isinstance(self._strategy, FSDPStrategy):
+        if isinstance(self._strategy, FSDPStrategy) and not _TORCH_GREATER_EQUAL_2_0:
             raise RuntimeError(
                 f"The `{type(self).__name__}` requires the model and optimizer(s) to be set up separately."
                 " Create and set up the model first through `model = self.setup_model(model)`. Then create the"

src/lightning/fabric/strategies/fsdp.py

Lines changed: 38 additions & 11 deletions
@@ -36,7 +36,11 @@
 )
 from lightning.fabric.utilities.distributed import group as _group
 from lightning.fabric.utilities.distributed import ReduceOp
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_1_13
+from lightning.fabric.utilities.imports import (
+    _TORCH_GREATER_EQUAL_1_12,
+    _TORCH_GREATER_EQUAL_1_13,
+    _TORCH_GREATER_EQUAL_2_0,
+)
 from lightning.fabric.utilities.rank_zero import rank_zero_only, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed

@@ -101,7 +105,11 @@ def __init__(
         self._process_group_backend: Optional[str] = process_group_backend
         self._timeout: Optional[timedelta] = timeout
         self._backward_sync_control = _FSDPBackwardSyncControl()
-        self._ddp_kwargs = kwargs
+        self._fsdp_kwargs = kwargs
+
+        if _TORCH_GREATER_EQUAL_2_0:
+            # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()`
+            self._fsdp_kwargs.setdefault("use_orig_params", True)

         if activation_checkpointing and not _TORCH_GREATER_EQUAL_1_13:
             raise ValueError("Activation checkpointing requires torch >= 1.13.0. HINT: `pip install -U torch`")

@@ -157,28 +165,44 @@ def setup_environment(self) -> None:
     def setup_module_and_optimizers(
         self, module: Module, optimizers: List[Optimizer]
     ) -> Tuple[Module, List[Optimizer]]:
-        raise NotImplementedError(
-            f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)."
-            " Please do it in this order: Create the model, call `setup_module`, create the optimizer,"
-            " call `setup_optimizer`."
-        )
+        """Wraps the model into a
+        :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module
+        and sets `use_orig_params=True` to keep the reference to the original parameters in the
+        optimizer.
+        """
+        if not _TORCH_GREATER_EQUAL_2_0:
+            raise NotImplementedError(
+                f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)."
+                " Please do it in this order: Create the model, call `setup_module`, create the optimizer,"
+                " call `setup_optimizer`."
+            )
+        use_orig_params = self._fsdp_kwargs.get("use_orig_params")
+        if use_orig_params is False:
+            raise ValueError(
+                f"You set `{type(self).__name__}(use_orig_params=False)` but this is not supported when"
+                " setting the model and optimizer up jointly. Either set it to `True` or set the objects"
+                " up in this order: Create the model, call `setup_module`, create the optimizer,"
+                " call `setup_optimizer`."
+            )
+        module = self.setup_module(module)
+        return module, optimizers

     def setup_module(self, module: Module) -> "FullyShardedDataParallel":
         """Wraps the model into a
         :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module."""
         from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel

-        if "auto_wrap_policy" in self._ddp_kwargs and any(
+        if "auto_wrap_policy" in self._fsdp_kwargs and any(
             isinstance(mod, FullyShardedDataParallel) for mod in module.modules()
         ):
             # If model is already wrapped, we need to avoid sending the `auto_wrap_policy`
-            del self._ddp_kwargs["auto_wrap_policy"]
+            del self._fsdp_kwargs["auto_wrap_policy"]
         wrapped_module = FullyShardedDataParallel(
             module=module,
             cpu_offload=self.cpu_offload,
             mixed_precision=self.mixed_precision_config,
             device_id=self.root_device.index,
-            **self._ddp_kwargs,
+            **self._fsdp_kwargs,
         )

         # activation checkpointing needs to be set up after wrapping the model

@@ -194,6 +218,9 @@ def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
         that the optimizer was created after the model was wrapped with :meth:`setup_module` with a reference to the
         flattened parameters.
         """
+        if _TORCH_GREATER_EQUAL_2_0:
+            return optimizer
+
         from torch.distributed.fsdp import FlatParameter

         num_groups = len(optimizer.param_groups)

@@ -224,7 +251,7 @@ def module_sharded_context(self) -> Generator:
             cpu_offload=self.cpu_offload,
             mixed_precision=self.mixed_precision_config,
             device_id=self.root_device.index,
-            **self._ddp_kwargs,
+            **self._fsdp_kwargs,
         ):
             yield
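
Illustration (not part of the diff): a hedged sketch of the multi-parameter-group setup that the `use_orig_params=True` default is meant to enable with the Fabric FSDP strategy on PyTorch >= 2.0. Learning rates, layer sizes, and device count are arbitrary placeholders.

import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

fabric = Fabric(strategy=FSDPStrategy(), devices=2)  # assumes PyTorch >= 2.0 and 2 GPUs
fabric.launch()

model = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.Linear(32, 2))

# With use_orig_params=True the original parameters remain addressable,
# so an optimizer with multiple param groups can be set up jointly with the model.
optimizer = torch.optim.Adam(
    [
        {"params": model[0].parameters(), "lr": 1e-3},
        {"params": model[1].parameters(), "lr": 1e-4},
    ]
)
model, optimizer = fabric.setup(model, optimizer)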

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -9,11 +9,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Added

--
+- Added support for multiple optimizer parameter groups when using the FSDP strategy ([#17309](https://github.com/Lightning-AI/lightning/pull/17309))


 ### Changed

+- Removed the limitation to call `self.trainer.model.parameters()` in `LightningModule.configure_optimizers()` ([#17309](https://github.com/Lightning-AI/lightning/pull/17309))
+
 - Generalized `Optimizer` validation to accommodate both FSDP 1.x and 2.x ([#16733](https://github.com/Lightning-AI/lightning/pull/16733))

src/lightning/pytorch/strategies/fsdp.py

Lines changed: 13 additions & 1 deletion
@@ -32,7 +32,11 @@
     _sync_ddp_if_available,
 )
 from lightning.fabric.utilities.distributed import group as _group
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_1_13
+from lightning.fabric.utilities.imports import (
+    _TORCH_GREATER_EQUAL_1_12,
+    _TORCH_GREATER_EQUAL_1_13,
+    _TORCH_GREATER_EQUAL_2_0,
+)
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import ProcessGroup, ReduceOp

@@ -130,6 +134,10 @@ def __init__(
             [activation_checkpointing] if not isinstance(activation_checkpointing, list) else activation_checkpointing
         )
         self.kwargs = kwargs
+        if _TORCH_GREATER_EQUAL_2_0:
+            # Avoids the need for user to reference params in `configure_optimizers` via
+            # `self.trainer.model.parameters()` and enables support for multiple parameter groups.
+            self.kwargs.setdefault("use_orig_params", True)

     @property
     def root_device(self) -> torch.device:

@@ -249,6 +257,9 @@ def setup(self, trainer: "pl.Trainer") -> None:
         self.setup_precision_plugin()

     def setup_optimizers(self, trainer: "pl.Trainer") -> None:
+        if self.kwargs.get("use_orig_params"):
+            return super().setup_optimizers(trainer)
+
         invalid_params_error = False
         try:
             super().setup_optimizers(trainer)

@@ -258,6 +269,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
             invalid_params_error = True

         if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers):
+            # We avoid this limitation in PyTorch >= 2.0 by setting `use_orig_params=True`
             raise ValueError(
                 "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the"
                 " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the"

tests/tests_fabric/helpers/models.py

Lines changed: 2 additions & 8 deletions
@@ -8,7 +8,6 @@
 from torch.utils.data import DataLoader, Dataset, IterableDataset

 from lightning.fabric import Fabric
-from lightning.fabric.strategies.fsdp import FSDPStrategy


 class RandomDataset(Dataset):

@@ -56,13 +55,8 @@ def after_optimizer_step(self, model: Module, optimizer: Optimizer) -> None:

     def run(self) -> None:
         model = self.get_model()
-        if isinstance(self.strategy, FSDPStrategy):
-            model = self.setup_module(model)
-            optimizer = self.get_optimizer(model)
-            optimizer = self.setup_optimizers(optimizer)
-        else:
-            optimizer = self.get_optimizer(model)
-            model, optimizer = self.setup(model, optimizer)
+        optimizer = self.get_optimizer(model)
+        model, optimizer = self.setup(model, optimizer)

         dataloader = self.get_dataloader()
         dataloader = self.setup_dataloaders(dataloader)

tests/tests_fabric/strategies/test_fsdp.py

Lines changed: 31 additions & 7 deletions
@@ -58,18 +58,42 @@ def test_fsdp_cpu_offload():


 @RunIf(min_torch="1.12")
-def test_fsdp_setup_optimizer_validation():
+@pytest.mark.parametrize("torch_ge_2_0", [False, True])
+def test_fsdp_setup_optimizer_validation(torch_ge_2_0):
     """Test that `setup_optimizer()` validates the param groups and reference to FSDP parameters."""
     module = nn.Linear(2, 2)
     strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")])

-    bad_optimizer = Adam([{"params": [module.weight]}, {"params": [module.bias], "lr": 1e-3}])
-    with pytest.raises(ValueError, match="does not support multiple param groups"):
-        strategy.setup_optimizer(bad_optimizer)
+    with mock.patch("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", torch_ge_2_0):
+        bad_optimizer_1 = Adam([{"params": [module.weight]}, {"params": [module.bias], "lr": 1e-3}])
+        bad_optimizer_2 = Adam(module.parameters())

-    bad_optimizer = Adam(module.parameters())
-    with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameter"):
-        strategy.setup_optimizer(bad_optimizer)
+        if torch_ge_2_0:
+            strategy.setup_optimizer(bad_optimizer_1)
+            strategy.setup_optimizer(bad_optimizer_2)
+        else:
+            with pytest.raises(ValueError, match="does not support multiple param groups"):
+                strategy.setup_optimizer(bad_optimizer_1)
+            with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameter"):
+                strategy.setup_optimizer(bad_optimizer_2)
+
+
+@RunIf(min_torch="2.0.0")
+@mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.setup_module")
+def test_fsdp_setup_use_orig_params(_):
+    module = nn.Linear(2, 2)
+    optimizer = Adam(module.parameters())
+
+    strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")], use_orig_params=False)
+    assert not strategy._fsdp_kwargs["use_orig_params"]
+
+    with pytest.raises(ValueError, match=r"`FSDPStrategy\(use_orig_params=False\)` but this is not supported"):
+        strategy.setup_module_and_optimizers(module, optimizer)
+
+    strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")])
+    assert strategy._fsdp_kwargs["use_orig_params"]
+    strategy.setup_module_and_optimizers(module, optimizer)
+    assert strategy._fsdp_kwargs["use_orig_params"]


 @RunIf(min_torch="1.12")
