Commit d319270
materialize
1 parent 0929ed7

File tree
  • src/lightning/pytorch/strategies/fsdp.py

1 file changed: 7 additions, 2 deletions
src/lightning/pytorch/strategies/fsdp.py (7 additions, 2 deletions)
@@ -53,7 +53,7 @@
     _TORCH_GREATER_EQUAL_2_0,
 )
 from lightning.fabric.utilities.init import _EmptyInit
-from lightning.fabric.utilities.load import _lazy_load
+from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import _PATH, ProcessGroup, ReduceOp
@@ -574,7 +574,12 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
 
         if _is_full_checkpoint(path):
             checkpoint = _lazy_load(path) if _TORCH_GREATER_EQUAL_2_0 else torch.load(path, map_location="cpu")
-            _load_raw_module_state(checkpoint["state_dict"], module=self.model, world_size=self.world_size)
+            _load_raw_module_state(checkpoint.pop("state_dict"), module=self.model, world_size=self.world_size)
+
+            if _TORCH_GREATER_EQUAL_2_0:
+                # Materialize lazy tensors if there are any left in the checkpoint
+                # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues
+                checkpoint = _materialize_tensors(checkpoint)
 
             from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
             from torch.distributed.fsdp import OptimStateKeyType
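For context: after "state_dict" is popped out and loaded into the module, `_materialize_tensors` resolves any lazy tensors still left in the remaining checkpoint entries (notably the optimizer states) into real tensors. Below is a minimal sketch of a helper in that spirit, assuming lazy tensors are represented by a wrapper exposing a materialize method; `_LazyTensorStub` and `materialize()` are hypothetical names for illustration, not Fabric's actual internals:

from typing import Any


class _LazyTensorStub:
    """Hypothetical lazy-tensor wrapper: holds a zero-arg callable that
    loads the real tensor from storage on demand."""

    def __init__(self, loader):
        self._loader = loader

    def materialize(self):
        return self._loader()


def materialize_tensors(obj: Any) -> Any:
    """Recursively replace lazy-tensor stubs with the tensors they wrap,
    preserving the container structure of the checkpoint dict."""
    if isinstance(obj, _LazyTensorStub):
        return obj.materialize()
    if isinstance(obj, dict):
        return {key: materialize_tensors(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(materialize_tensors(value) for value in obj)
    return obj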

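The diff comment about "deepcopy pickle issues" refers to the fact that `torch.optim.Optimizer.load_state_dict` deep-copies the incoming state dict, and `copy.deepcopy` falls back on pickle semantics for objects it does not know how to copy. A self-contained demonstration of that failure mode, using an open file handle as a stand-in for the unpicklable resource a lazy tensor keeps alive (an assumption for illustration; the exact objects Fabric's lazy tensors hold may differ):

import copy
import tempfile


class _HandleBackedTensor:
    """Hypothetical lazy tensor that keeps an open file handle to the
    checkpoint; open handles cannot be pickled."""

    def __init__(self, fh):
        self._fh = fh


with tempfile.TemporaryFile() as fh:
    # Optimizer.load_state_dict deepcopies the incoming state, so a lazy
    # tensor left in the optimizer state triggers this same TypeError:
    state = {"optimizer_states": [{"exp_avg": _HandleBackedTensor(fh)}]}
    try:
        copy.deepcopy(state)
    except TypeError as err:
        print(f"deepcopy failed: {err}")  # e.g. cannot pickle '_io.BufferedRandom' object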