
Commit 03bb389

ethanwharris and Borda authored
Fix double precision + ddp_spawn (#6924)
* Initial fix
* Initial fix
* Initial fix
* Updates
* Updates
* Update typing and docs
* Undo accidental refactor
* Remove unused imports
* Add DDP double precision test
* Remove unused variable
* Update CHANGELOG.md
* Fix test
* Update tests
* Formatting
* Revert bad change
* Add back changes
* Correct wrapping order
* Improve unwrapping
* Correct wrapping order
* Fix... finally
* Respond to comments
* Drop ddp test
* Simplify ddp spawn test
* Simplify ddp spawn test

Co-authored-by: Jirka Borovec <[email protected]>
1 parent 195b24b commit 03bb389

File tree

CHANGELOG.md
pytorch_lightning/overrides/base.py
pytorch_lightning/plugins/precision/double.py
pytorch_lightning/plugins/training_type/training_type_plugin.py
tests/accelerators/test_ddp.py
tests/overrides/test_base.py
tests/plugins/test_double_plugin.py

7 files changed: +186 -59 lines


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -179,6 +179,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed formatting of info message when max training time reached ([#7780](https://github.com/PyTorchLightning/pytorch-lightning/pull/7780))
 
 
+- Fixed a bug where `precision=64` with `accelerator='ddp_spawn'` would throw a pickle error ([#6924](https://github.com/PyTorchLightning/pytorch-lightning/pull/6924))
+
+
 ## [1.3.2] - 2021-05-18
 
 ### Changed
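
The underlying failure is a pickling one: `accelerator='ddp_spawn'` sends the model to the worker processes via `pickle`, and the previous `DoublePrecisionPlugin` patched `training_step`, `forward`, etc. with locally defined closures (the `_DoublePrecisionPatch` class removed further down), which `pickle` cannot serialize. The commit replaces the patching with a wrapper module. A minimal standalone sketch of the difference (toy code, not the Lightning implementation):

```python
import pickle

import torch
import torch.nn as nn


class ToyModule(nn.Module):
    def forward(self, x):
        return x * 2


def patch_forward(model: nn.Module) -> nn.Module:
    """Old-style approach: overwrite the bound method with a closure stored on the instance."""
    old_forward = model.forward

    def new_forward(*args, **kwargs):
        args = tuple(a.double() if isinstance(a, torch.Tensor) else a for a in args)
        return old_forward(*args, **kwargs)

    model.forward = new_forward
    return model


class DoubleWrapper(nn.Module):
    """New-style approach: a wrapper module whose state is plain, picklable attributes."""

    def __init__(self, module: nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        args = tuple(a.double() if isinstance(a, torch.Tensor) else a for a in args)
        return self.module(*args, **kwargs)


try:
    pickle.dumps(patch_forward(ToyModule()))
except (AttributeError, pickle.PicklingError) as err:
    print("patched module cannot be pickled:", err)  # local closures are not picklable

pickle.dumps(DoubleWrapper(ToyModule()))  # fine: an ordinary nn.Module hierarchy
```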

pytorch_lightning/overrides/base.py

Lines changed: 46 additions & 8 deletions
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any, Union
+
 import torch
 from torch.nn import DataParallel
 from torch.nn.parallel import DistributedDataParallel
@@ -19,9 +21,44 @@
 from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
 
 
-class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module):
+class _LightningPrecisionModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module):
 
     def __init__(self, pl_module: 'pl.LightningModule') -> None:
+        """
+        Wraps the user's LightningModule. Requires overriding all ``*_step`` methods and ``forward`` so that it can
+        safely be wrapped by a ``_LightningModuleWrapperBase`` and a ``*DataParallel``.
+
+        Args:
+            pl_module: the model to wrap
+        """
+        super().__init__()
+        self.module = pl_module
+
+        # set the parameters_to_ignore from LightningModule.
+        self._ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", [])
+
+    def training_step(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+    def validation_step(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+    def test_step(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+    def predict_step(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+    def on_post_move_to_device(self) -> None:
+        pass
+
+
+class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module):
+
+    def __init__(self, pl_module: Union['pl.LightningModule', _LightningPrecisionModuleWrapperBase]):
         """
         Wraps the user's LightningModule and redirects the forward call to the appropriate
         method, either ``training_step``, ``validation_step`` or ``test_step``.
@@ -39,8 +76,9 @@ def __init__(self, pl_module: 'pl.LightningModule') -> None:
         # set the parameters_to_ignore from LightningModule.
         self._ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", [])
 
-    def forward(self, *inputs, **kwargs):
-        trainer = self.module.trainer
+    def forward(self, *inputs: Any, **kwargs: Any) -> Any:
+        lightning_module = unwrap_lightning_module(self.module)
+        trainer = lightning_module.trainer
 
         if trainer and trainer.training:
             output = self.module.training_step(*inputs, **kwargs)
@@ -49,7 +87,7 @@ def forward(self, *inputs, **kwargs):
             # it is done manually in ``LightningModule.manual_backward``
             # `require_backward_grad_sync` will be reset in the
            # ddp_plugin ``post_training_step`` hook
-            if not self.module.automatic_optimization:
+            if not lightning_module.automatic_optimization:
                 trainer.model.require_backward_grad_sync = False
         elif trainer and trainer.testing:
             output = self.module.test_step(*inputs, **kwargs)
@@ -62,14 +100,14 @@ def forward(self, *inputs, **kwargs):
 
         return output
 
-    def on_post_move_to_device(self):
+    def on_post_move_to_device(self) -> None:
         pass
 
 
 def unwrap_lightning_module(wrapped_model) -> 'pl.LightningModule':
     model = wrapped_model
     if isinstance(model, (DistributedDataParallel, DataParallel)):
-        model = model.module
-    if isinstance(model, _LightningModuleWrapperBase):
-        model = model.module
+        model = unwrap_lightning_module(model.module)
+    if isinstance(model, (_LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase)):
+        model = unwrap_lightning_module(model.module)
     return model
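
With the recursive rewrite, `unwrap_lightning_module` can peel an arbitrary stack of wrappers, which is exactly the shape produced by `precision=64` plus DDP: `DistributedDataParallel(_LightningModuleWrapperBase(LightningDoublePrecisionModule(model)))`. A small sketch, assuming the classes from this commit are importable (the `DistributedDataParallel` layer is only mentioned in a comment because constructing it needs an initialized process group):

```python
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module
from pytorch_lightning.plugins.precision.double import LightningDoublePrecisionModule
from tests.helpers.boring_model import BoringModel

model = BoringModel()
# Nesting as produced by DoublePrecisionPlugin.connect() followed by the DDP plugin;
# in a real run a DistributedDataParallel layer would sit on top of this stack as well.
wrapped = _LightningModuleWrapperBase(LightningDoublePrecisionModule(model))

# The recursion peels every wrapper layer and hands back the user's LightningModule.
assert unwrap_lightning_module(wrapped) is model
```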

pytorch_lightning/plugins/precision/double.py

Lines changed: 52 additions & 43 deletions
@@ -12,28 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from functools import wraps
 from typing import Any, Generator, List, Tuple
 
 import torch
 import torch.nn as nn
 from torch.optim import Optimizer
 
 from pytorch_lightning.core.lightning import LightningModule
+from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 
 
-class _DoublePrecisionPatch:
-    """Class to handle patching of methods in the ``LightningModule`` and subsequent teardown."""
+class LightningDoublePrecisionModule(_LightningPrecisionModuleWrapperBase):
+    """
+    LightningModule wrapper which converts incoming floating point data in ``*_step`` and ``forward`` to double
+    (``torch.float64``) precision.
 
-    def __init__(self, model: nn.Module, method_name: str, old_method: Any) -> None:
-        self.model = model
-        self.method_name = method_name
-        self.old_method = old_method
+    Args:
+        pl_module: the model to wrap
+    """
 
-    def teardown(self) -> None:
-        setattr(self.model, self.method_name, self.old_method)
+    def __init__(self, pl_module: LightningModule):
+        super().__init__(pl_module)
 
     @staticmethod
     def _to_double_precision(data: torch.Tensor) -> torch.Tensor:
@@ -43,55 +44,63 @@ def _to_double_precision(data: torch.Tensor) -> torch.Tensor:
 
     @staticmethod
     def _move_float_tensors_to_double(collection: Any) -> Any:
-        return apply_to_collection(collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision)
-
-    @classmethod
-    def patch(cls, model: nn.Module, method_name: str) -> '_DoublePrecisionPatch':
-        old_method = getattr(model, method_name)
-
-        @wraps(old_method)
-        def new_method(*args: Any, **kwargs: Any) -> Any:
-            return old_method(
-                *_DoublePrecisionPatch._move_float_tensors_to_double(args),
-                **_DoublePrecisionPatch._move_float_tensors_to_double(kwargs)
-            )
-
-        setattr(model, method_name, new_method if callable(old_method) else old_method)
-        return cls(model, method_name, old_method)
+        return apply_to_collection(
+            collection,
+            torch.Tensor,
+            LightningDoublePrecisionModule._to_double_precision,
+        )
+
+    def training_step(self, *args: Any, **kwargs: Any) -> Any:
+        return self.module.training_step(
+            *LightningDoublePrecisionModule._move_float_tensors_to_double(args),
+            **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs),
+        )
+
+    def validation_step(self, *args: Any, **kwargs: Any) -> Any:
+        return self.module.validation_step(
+            *LightningDoublePrecisionModule._move_float_tensors_to_double(args),
+            **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs),
+        )
+
+    def test_step(self, *args: Any, **kwargs: Any) -> Any:
+        return self.module.test_step(
+            *LightningDoublePrecisionModule._move_float_tensors_to_double(args),
+            **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs),
+        )
+
+    def predict_step(self, *args: Any, **kwargs: Any) -> Any:
+        return self.module.predict_step(
+            *LightningDoublePrecisionModule._move_float_tensors_to_double(args),
+            **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs),
+        )
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        return self.module(
+            *LightningDoublePrecisionModule._move_float_tensors_to_double(args),
+            **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs),
        )
 
 
 class DoublePrecisionPlugin(PrecisionPlugin):
-    """Plugin for training with double (``torch.float64``) precision."""
+    """ Plugin for training with double (``torch.float64``) precision. """
 
     precision: int = 64
 
-    def __init__(self) -> None:
-        super().__init__()
-        self.patches: List[_DoublePrecisionPatch] = []
-
     def connect(
         self,
         model: nn.Module,
         optimizers: List[Optimizer],
         lr_schedulers: List[Any],
-    ) -> Tuple[nn.Module, List[Optimizer], List[Any]]:
-        """Converts the model to double precision and wraps the `training_step`, `validation_step`, `test_step`,
-        `predict_step`, and `forward` methods to convert incoming floating point data to double. Does not alter
-        `optimizers` or `lr_schedulers`."""
+    ) -> Tuple[nn.Module, List['Optimizer'], List[Any]]:
+        """Converts the model to double precision and wraps it in a ``LightningDoublePrecisionModule`` to convert
+        incoming floating point data to double (``torch.float64``) precision. Does not alter `optimizers` or
+        `lr_schedulers`.
+        """
         model = model.to(dtype=torch.float64)
-        if isinstance(model, LightningModule):
-            self.patches.append(_DoublePrecisionPatch.patch(model, 'training_step'))
-            self.patches.append(_DoublePrecisionPatch.patch(model, 'validation_step'))
-            self.patches.append(_DoublePrecisionPatch.patch(model, 'test_step'))
-            self.patches.append(_DoublePrecisionPatch.patch(model, 'predict_step'))
-            self.patches.append(_DoublePrecisionPatch.patch(model, 'forward'))
+        model = LightningDoublePrecisionModule(model)
 
         return super().connect(model, optimizers, lr_schedulers)
 
-    def post_dispatch(self) -> None:
-        while len(self.patches) > 0:
-            self.patches.pop().teardown()
-
     @contextmanager
     def train_step_context(self) -> Generator[None, None, None]:
         """

pytorch_lightning/plugins/training_type/training_type_plugin.py

Lines changed: 4 additions & 4 deletions
@@ -161,19 +161,19 @@ def start_predicting(self, trainer: 'pl.Trainer') -> None:
         self._results = trainer.run_stage()
 
     def training_step(self, *args, **kwargs):
-        return self.lightning_module.training_step(*args, **kwargs)
+        return self.model.training_step(*args, **kwargs)
 
     def post_training_step(self):
         pass
 
     def validation_step(self, *args, **kwargs):
-        return self.lightning_module.validation_step(*args, **kwargs)
+        return self.model.validation_step(*args, **kwargs)
 
     def test_step(self, *args, **kwargs):
-        return self.lightning_module.test_step(*args, **kwargs)
+        return self.model.test_step(*args, **kwargs)
 
     def predict_step(self, *args, **kwargs):
-        return self.lightning_module.predict_step(*args, **kwargs)
+        return self.model.predict_step(*args, **kwargs)
 
     def training_step_end(self, output):
         return output
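
The switch from `self.lightning_module` to `self.model` matters because `lightning_module` unwraps down to the raw user module, which would skip the precision wrapper's `*_step` conversions; dispatching through `self.model` keeps every wrapper in the call path. A simplified, hypothetical illustration (toy classes, not the actual plugin code):

```python
import torch


class UserModule(torch.nn.Module):
    def training_step(self, batch):
        # Under precision=64 the user's code legitimately expects float64 inputs.
        assert batch.dtype == torch.float64, "inputs were not converted"
        return batch.sum()


class DoubleWrapper(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def training_step(self, batch):
        # The wrapper is the only place where the float64 conversion happens.
        return self.module.training_step(batch.double())


wrapped = DoubleWrapper(UserModule())

wrapped.training_step(torch.rand(2))           # ok: the wrapper converts the batch
# wrapped.module.training_step(torch.rand(2))  # would fail the assert: calling the
#                                              # unwrapped module bypasses the conversion
```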

tests/accelerators/test_ddp.py

Lines changed: 10 additions & 2 deletions
@@ -123,7 +123,8 @@ def setup(self, stage: Optional[str] = None) -> None:
 
 
 @RunIf(min_gpus=2, min_torch="1.8.1", special=True)
-def test_ddp_wrapper(tmpdir):
+@pytest.mark.parametrize("precision", [16, 32])
+def test_ddp_wrapper(tmpdir, precision):
     """
     Test parameters to ignore are carried over for DDP.
     """
@@ -150,5 +151,12 @@ def on_train_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule')
             assert trainer.training_type_plugin.model.module._ddp_params_and_buffers_to_ignore == ('something')
 
     model = CustomModel()
-    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=CustomCallback())
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        precision=precision,
+        accelerator="ddp",
+        gpus=2,
+        callbacks=CustomCallback(),
+    )
     trainer.fit(model)

tests/overrides/test_base.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+import torch
+from torch.nn import DataParallel
+
+from pytorch_lightning.overrides.base import (
+    _LightningModuleWrapperBase,
+    _LightningPrecisionModuleWrapperBase,
+    unwrap_lightning_module,
+)
+from tests.helpers import BoringModel
+
+
+@pytest.mark.parametrize("wrapper_class", [
+    _LightningModuleWrapperBase,
+    _LightningPrecisionModuleWrapperBase,
+])
+def test_wrapper_device_dtype(wrapper_class):
+    model = BoringModel()
+    wrapped_model = wrapper_class(model)
+
+    wrapped_model.to(dtype=torch.float16)
+    assert model.dtype == torch.float16
+
+
+def test_unwrap_lightning_module():
+    model = BoringModel()
+    wrapped_model = _LightningPrecisionModuleWrapperBase(model)
+    wrapped_model = _LightningModuleWrapperBase(wrapped_model)
+    wrapped_model = DataParallel(wrapped_model)
+
+    assert unwrap_lightning_module(wrapped_model) == model

tests/plugins/test_double_plugin.py

Lines changed: 27 additions & 2 deletions
@@ -11,12 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import pickle
+from unittest.mock import MagicMock
+
 import pytest
 import torch
 from torch.utils.data import DataLoader, Dataset
 
 from pytorch_lightning import Trainer
+from pytorch_lightning.plugins import DoublePrecisionPlugin
 from tests.helpers.boring_model import BoringModel, RandomDataset
+from tests.helpers.runif import RunIf
 
 
 class RandomFloatIntDataset(Dataset):
@@ -121,7 +126,6 @@ def predict_dataloader(self):
 @pytest.mark.parametrize('boring_model', (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward))
 def test_double_precision(tmpdir, boring_model):
     model = boring_model()
-    original_training_step = model.training_step
 
     trainer = Trainer(
         max_epochs=2,
@@ -134,4 +138,25 @@ def test_double_precision(tmpdir, boring_model):
     trainer.test(model)
     trainer.predict(model)
 
-    assert model.training_step == original_training_step
+
+@RunIf(min_gpus=2)
+def test_double_precision_ddp(tmpdir):
+    model = DoublePrecisionBoringModel()
+
+    trainer = Trainer(
+        max_epochs=1,
+        default_root_dir=tmpdir,
+        accelerator='ddp_spawn',
+        gpus=2,
+        fast_dev_run=2,
+        precision=64,
+        log_every_n_steps=1,
+    )
+    trainer.fit(model)
+
+
+def test_double_precision_pickle(tmpdir):
+    model = BoringModel()
+    plugin = DoublePrecisionPlugin()
+    model, _, __ = plugin.connect(model, MagicMock(), MagicMock())
+    pickle.dumps(model)

0 commit comments
