
Commit 0c02c44

Simplified setup of optimizers in FSDP (#17309)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 51697a8 commit 0c02c44

File tree

4 files changed: +48 / -8 lines changed

docs/source-pytorch/advanced/model_parallel.rst

Lines changed: 2 additions & 2 deletions
@@ -88,9 +88,9 @@ simplest way to do it is auto wrapping, which can serve as a drop-in replacement
    have to ``wrap`` layers manually as in the case of manual wrapping.
 
 .. note::
-    While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else
+    For users of PyTorch < 2.0: While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else
     PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and your
-    ``lightning_module.parameters()`` will return a generator with no params. This inconvenience will be addressed in the future.
+    ``lightning_module.parameters()`` will return a generator with no params.
 
 
 .. code-block:: python
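
For illustration only, a minimal sketch of the two patterns the updated note distinguishes; the module, layer sizes, and optimizer settings below are hypothetical and not part of this commit:

```python
import torch
from lightning.pytorch import LightningModule


class SketchModel(LightningModule):  # hypothetical module, for illustration only
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def configure_optimizers(self):
        # PyTorch >= 2.0 (Lightning now defaults to `use_orig_params=True`):
        # the module's own parameters stay valid after FSDP auto-wrapping.
        return torch.optim.SGD(self.parameters(), lr=0.1)

        # PyTorch < 2.0: reference the wrapped model held by the trainer instead,
        # because `self.parameters()` would yield no params after sharding:
        # return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1)
```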

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -9,11 +9,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
--
+- Added support for multiple optimizer parameter groups when using the FSDP strategy ([#17309](https://github.com/Lightning-AI/lightning/pull/17309))
 
 
 ### Changed
 
+- Removed the limitation to call `self.trainer.model.parameters()` in `LightningModule.configure_optimizers()` ([#17309](https://github.com/Lightning-AI/lightning/pull/17309))
+
 - Generalized `Optimizer` validation to accommodate both FSDP 1.x and 2.x ([#16733](https://github.com/Lightning-AI/lightning/pull/16733))
 
 
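
As a rough illustration of the parameter-group support added above; the model layout and learning rates are made-up assumptions, not code from this commit:

```python
import torch
from lightning.pytorch import LightningModule


class TwoGroupModel(LightningModule):  # hypothetical model, for illustration only
    def __init__(self):
        super().__init__()
        self.backbone = torch.nn.Linear(32, 32)
        self.head = torch.nn.Linear(32, 2)

    def configure_optimizers(self):
        # Multiple parameter groups now work under the FSDP strategy
        # (PyTorch >= 2.0 with `use_orig_params=True`, the new default).
        return torch.optim.Adam(
            [
                {"params": self.backbone.parameters(), "lr": 1e-3},
                {"params": self.head.parameters(), "lr": 1e-2},
            ]
        )
```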

src/lightning/pytorch/strategies/fsdp.py

Lines changed: 13 additions & 1 deletion
@@ -32,7 +32,11 @@
     _sync_ddp_if_available,
 )
 from lightning.fabric.utilities.distributed import group as _group
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12, _TORCH_GREATER_EQUAL_1_13
+from lightning.fabric.utilities.imports import (
+    _TORCH_GREATER_EQUAL_1_12,
+    _TORCH_GREATER_EQUAL_1_13,
+    _TORCH_GREATER_EQUAL_2_0,
+)
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import ProcessGroup, ReduceOp
@@ -130,6 +134,10 @@ def __init__(
             [activation_checkpointing] if not isinstance(activation_checkpointing, list) else activation_checkpointing
         )
         self.kwargs = kwargs
+        if _TORCH_GREATER_EQUAL_2_0:
+            # Avoids the need for user to reference params in `configure_optimizers` via
+            # `self.trainer.model.parameters()` and enables support for multiple parameter groups.
+            self.kwargs.setdefault("use_orig_params", True)
 
     @property
     def root_device(self) -> torch.device:
@@ -249,6 +257,9 @@ def setup(self, trainer: "pl.Trainer") -> None:
         self.setup_precision_plugin()
 
     def setup_optimizers(self, trainer: "pl.Trainer") -> None:
+        if self.kwargs.get("use_orig_params"):
+            return super().setup_optimizers(trainer)
+
         invalid_params_error = False
         try:
             super().setup_optimizers(trainer)
@@ -258,6 +269,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
             invalid_params_error = True
 
         if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers):
+            # We avoid this limitation in PyTorch >= 2.0 by setting `use_orig_params=True`
             raise ValueError(
                 "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the"
                 " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the"

tests/tests_pytorch/strategies/test_fsdp.py

Lines changed: 30 additions & 4 deletions
@@ -1,4 +1,5 @@
 import os
+from contextlib import nullcontext
 from functools import partial
 from typing import Any, Callable, Dict, Optional
 from unittest import mock
@@ -90,7 +91,8 @@ def __init__(self):
         self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2))
 
     def configure_optimizers(self):
-        return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1)
+        parameters = self.parameters() if _TORCH_GREATER_EQUAL_2_0 else self.trainer.model.parameters()
+        return torch.optim.SGD(parameters, lr=0.1)
 
     def on_train_batch_end(self, *_) -> None:
         self._assert_layer_fsdp_instance()
@@ -297,14 +299,24 @@ def test_fsdp_checkpoint_multi_gpus(tmpdir, model, strategy, strategy_cfg):
 
 @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12")
 def test_invalid_parameters_in_optimizer():
-    trainer = Trainer(strategy="fsdp", accelerator="cuda", devices=1)
+    trainer = Trainer(
+        strategy="fsdp",
+        accelerator="cuda",
+        devices=1,
+        fast_dev_run=1,
+    )
+    error_context = (
+        nullcontext()
+        if _TORCH_GREATER_EQUAL_2_0
+        else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters")
+    )
 
     class EmptyParametersModel(BoringModel):
         def configure_optimizers(self):
             return torch.optim.Adam(self.parameters(), lr=1e-2)
 
     model = EmptyParametersModel()
-    with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"):
+    with error_context:
         trainer.fit(model)
 
     class NoFlatParametersModel(BoringModel):
@@ -313,7 +325,7 @@ def configure_optimizers(self):
             return torch.optim.Adam(layer.parameters(), lr=1e-2)
 
     model = NoFlatParametersModel()
-    with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters"):
+    with error_context:
         trainer.fit(model)
 
 
@@ -370,3 +382,17 @@ def test_fsdp_strategy_cpu_offload():
     config = CPUOffload()
     strategy = FSDPStrategy(cpu_offload=config)
     assert strategy.cpu_offload == config
+
+
+@RunIf(min_torch="1.12")
+def test_fsdp_use_orig_params():
+    """Test that Lightning enables `use_orig_params` in PyTorch >= 2.0."""
+    with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False):
+        strategy = FSDPStrategy()
+        assert "use_orig_params" not in strategy.kwargs
+
+    with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", True):
+        strategy = FSDPStrategy()
+        assert strategy.kwargs["use_orig_params"]
+        strategy = FSDPStrategy(use_orig_params=False)
+        assert not strategy.kwargs["use_orig_params"]
