Closed
Changes from all commits (70 commits)
568b35e
Added base fairscale accelerator and dependency. modified checkpoint …
SeanNaren Oct 11, 2020
630796b
Added wrapper class to ensure we only call state_dict on rank zero
SeanNaren Oct 15, 2020
133a250
Added additional comment from override, fixed over-identation
SeanNaren Oct 15, 2020
eb552ef
Added wrapper for sharded ddp
SeanNaren Oct 18, 2020
3439f47
Update state_dict call, allow every process to call function, only wa…
SeanNaren Oct 19, 2020
e15fc65
Update broadcast call in model dispatch
SeanNaren Oct 22, 2020
bd000dc
Swap name to sharded_ddp
SeanNaren Nov 1, 2020
16d679d
Update API based on fairscale oss_autograd changes
Nov 5, 2020
f4343ec
Add check for adding ddp sampler for ddp_sharded
Nov 5, 2020
bc5ab46
Merge branch 'master' into feature/817-fairscale
SeanNaren Nov 5, 2020
aca672b
Remove need for explicit require grad by ensuring we clear up handles…
Nov 6, 2020
39df0ff
Merge branch 'master' into feature/817-fairscale
SeanNaren Nov 6, 2020
e4abd88
Revert to normal DDP in testing mode
Nov 6, 2020
628ef0f
Added elastic sharded ddp, need to reduce duplication
Nov 8, 2020
0147d03
Added sharded ddp to check
Nov 8, 2020
6b17458
Added temporary broadcast to ensure we broadcast parameters regardles…
Nov 9, 2020
8c59f94
Added fix to torchelastic accelerator
Nov 9, 2020
5f81215
Merge branch 'master' into feature/817-fairscale
Nov 10, 2020
33363c4
Added initial changes to support ShardedDDPPlugin
Nov 10, 2020
8cdd66c
Removed more sharded refs
Nov 10, 2020
aeeca87
Removed ref
Nov 10, 2020
88e50d0
Fixed indent
Nov 10, 2020
135d2d6
Pass model reference
Nov 10, 2020
d86c92b
Better name
Nov 10, 2020
c709364
Added to device function to ensure we move to device when using shard…
Nov 10, 2020
1bbf0df
Pass model ref
Nov 10, 2020
aedeaed
Pass DDP model
Nov 10, 2020
5c18b18
Fix reduce
Nov 10, 2020
1c2316b
Fix logic
Nov 10, 2020
1dfbd95
Simplified gradscaler
Nov 10, 2020
6de6a4f
Add temporary grad scaler handling
Nov 10, 2020
27d2682
Swapped to encapsulating within the precision connector
Nov 11, 2020
770caa8
Update scaler if using amp
Nov 11, 2020
1ba7bcf
Refactor to select amp plugin correctly as sharded
Nov 12, 2020
4b3ebd6
Add additional plugin check to ensure it has been init
Nov 12, 2020
c9c7921
Added custom sharded clip gradients logic, abstracted out precision p…
Nov 12, 2020
b2068de
Merge branch 'master' into feature/817-fairscale
Nov 12, 2020
acda934
Add rank save check for state sharding, removing need to override OSS…
Nov 14, 2020
166237b
Allow ddp plugin to modify optimizer state saving
Nov 14, 2020
6d285a8
Merge branch 'master' into feature/817-fairscale-2n
tchaton Nov 14, 2020
acaa995
Rely on the accelerator for optimizer states
Nov 14, 2020
af65eff
Ensure we init the accelerator for the saving function
Nov 14, 2020
5c8a7b4
Better comment for optim state dump
Nov 14, 2020
9b82bfb
Revert "Ensure we init the accelerator for the saving function"
Nov 14, 2020
f9929c0
Added accelerator check to initialize tuner before saving model check…
Nov 14, 2020
e669930
Simplify comment
Nov 15, 2020
18b0d74
Revert "Added accelerator check to initialize tuner before saving mod…
Nov 15, 2020
09ef93e
Return single optimizer state to reduce duplication
Nov 15, 2020
14b54e2
Fixed docstring
Nov 15, 2020
6be7caf
Fixed typing
Nov 15, 2020
098fc64
Fixed comment
Nov 15, 2020
db91ec5
Added CHANGELOG.md
Nov 15, 2020
206a660
Allow ddp plugin to move the input to a different device if needed
Nov 15, 2020
68c8e55
Merge branch 'master' into feature/817-fairscale-3n-redo
SeanNaren Nov 15, 2020
1a25532
Merge branch 'master' into feature/817-fairscale-3n-redo
SeanNaren Nov 15, 2020
da4c022
Merge branch 'master' into feature/817-fairscale-3n-redo
williamFalcon Nov 15, 2020
ca6d536
Swapped name to on_before_forward to align with hooks in the future
Nov 15, 2020
bbe5760
Merge branch 'master' into feature/817-fairscale-3n-redo
SeanNaren Nov 18, 2020
3b90d6b
Merge branch 'master' into feature/817-fairscale-2n
SeanNaren Nov 18, 2020
369d8c7
Merge branch 'master' into feature/817-fairscale-2n
SeanNaren Nov 18, 2020
1125ae3
Merge branch 'master' into feature/817-fairscale-3n-redo
SeanNaren Nov 18, 2020
61fc39c
Expose scaler in amp plugin
Nov 18, 2020
681217b
Merge branch 'master' into feature/817-fairscale-4n
SeanNaren Nov 18, 2020
9e5a4d8
Merge branch 'master' into feature/817-fairscale
Nov 18, 2020
a3d9680
Merge branch 'feature/817-fairscale-2n' into feature/817-fairscale
Nov 18, 2020
30aaa27
Merge branch 'feature/817-fairscale-3n-redo' into feature/817-fairscale
Nov 18, 2020
ff49a59
Merge branch 'feature/817-fairscale-4n' into feature/817-fairscale
Nov 18, 2020
8aa9bd2
Merged pending PRs to unify API, updated to use latest sharded DDP
Nov 18, 2020
115d498
Fixed var call
Nov 18, 2020
f461094
temp
Nov 24, 2020
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -36,6 +36,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
[#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439))


- Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675))


### Changed

- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903))
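Note: the entry above is easier to read next to a concrete override. Below is a minimal sketch of a plugin that customises optimizer state saving through the new hook; it assumes the base class in pytorch_lightning/plugins/ddp_plugin.py is named DDPPlugin, and `consolidate_state_dict` stands in for whatever collation a sharded optimizer actually exposes.

from torch.optim import Optimizer

from pytorch_lightning.plugins.ddp_plugin import DDPPlugin


class ConsolidatingDDPPlugin(DDPPlugin):
    """Hypothetical plugin: collate sharded optimizer state before it is checkpointed."""

    def optimizer_state(self, optimizer: Optimizer) -> dict:
        # If the optimizer shards its state across ranks, gather it first so the
        # checkpoint holds the full state. `consolidate_state_dict` is illustrative;
        # a real sharded optimizer may expose a different API.
        if hasattr(optimizer, "consolidate_state_dict"):
            optimizer.consolidate_state_dict()
        return optimizer.state_dict()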
13 changes: 12 additions & 1 deletion pytorch_lightning/accelerators/accelerator.py
@@ -13,7 +13,7 @@
# limitations under the License.
import os
from enum import Enum
from typing import Any, Optional, Union
from typing import Any, Optional, Union, List

import torch
from torch.optim import Optimizer
@@ -202,6 +202,17 @@ def sync_tensor(self,
"""
raise NotImplementedError()

def optimizer_state(self, optimizer: Optimizer) -> dict:
"""
Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom
plugins.
Return:
Optimizer state dict
"""
if self.ddp_plugin:
return self.ddp_plugin.optimizer_state(optimizer)
return optimizer.state_dict()

def __getstate__(self):
return {
'trainer': self.trainer,
3 changes: 1 addition & 2 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -195,8 +195,7 @@ def select_accelerator(self):
use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks

# torchelastic or general non_slurm ddp
te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed
use_torchelastic_ddp = self.trainer.use_ddp and self._is_using_torchelastic()

use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"
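Note: the body of `_is_using_torchelastic()` is not part of this hunk, but the removed inline check suggests it wraps the same environment test. A sketch under that assumption:

import os


def _is_using_torchelastic(self) -> bool:
    # Presumably equivalent to the removed inline check: torchelastic sets
    # WORLD_SIZE plus either GROUP_RANK or NODE_RANK in the environment.
    has_world_size = 'WORLD_SIZE' in os.environ
    has_rank = 'GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ
    return has_world_size and has_rank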
18 changes: 10 additions & 8 deletions pytorch_lightning/accelerators/ddp2_accelerator.py
@@ -61,21 +61,23 @@ def train(self):
return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model)

def training_step(self, args):
return self._step(args)

def validation_step(self, args):
return self._step(args)

def test_step(self, args):
return self._step(args)

def _step(self, args):
args = self.ddp_plugin.on_before_forward(args, self.trainer.get_model())
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: Optional[str] = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
18 changes: 10 additions & 8 deletions pytorch_lightning/accelerators/ddp_accelerator.py
@@ -151,21 +151,23 @@ def train(self):
return results

def training_step(self, args):
return self._step(args)

def validation_step(self, args):
return self._step(args)

def test_step(self, args):
return self._step(args)

def _step(self, args):
args = self.ddp_plugin.on_before_forward(args, self.trainer.get_model())
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: Optional[str] = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
18 changes: 10 additions & 8 deletions pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
@@ -156,21 +156,23 @@ def ddp_train(self, process_idx, mp_queue, model):
torch.cuda.empty_cache()

def training_step(self, args):
return self._step(args)

def validation_step(self, args):
return self._step(args)

def test_step(self, args):
return self._step(args)

def _step(self, args):
args = self.ddp_plugin.on_before_forward(args, self.trainer.get_model())
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: Optional[str] = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
18 changes: 10 additions & 8 deletions pytorch_lightning/accelerators/ddp_hpc_accelerator.py
@@ -77,21 +77,23 @@ def get_device_ids(self):
return device_ids

def training_step(self, args):
return self._step(args)

def validation_step(self, args):
return self._step(args)

def test_step(self, args):
return self._step(args)

def _step(self, args):
args = self.ddp_plugin.on_before_forward(args, self.trainer.get_model())
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: Optional[str] = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
25 changes: 17 additions & 8 deletions pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -182,21 +182,23 @@ def get_device_ids(self):
return device_ids

def training_step(self, args):
return self._step(args)

def validation_step(self, args):
return self._step(args)

def test_step(self, args):
return self._step(args)

def _step(self, args):
args = self.ddp_plugin.on_before_forward(args, self.trainer.get_model())
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: Optional[str] = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
@@ -270,3 +272,10 @@ def sync_tensor(self,
group: Optional[Any] = None,
reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
return sync_ddp_if_available(tensor, group, reduce_op)

def sync_optim_state(self):
self.ddp_plugin.sync_optim_state(self.trainer.get_model())

@property
def rank_should_save_optim_state(self):
return self.ddp_plugin.rank_should_save_optim_state(self.trainer.global_rank)
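Note: `sync_optim_state` and `rank_should_save_optim_state` delegate to the DDP plugin, but the plugin-side counterparts are not shown in this diff. A sketch of plausible defaults, purely as an assumption (base class name DDPPlugin assumed); a sharded plugin would likely consolidate state and restrict saving to rank zero instead.

from pytorch_lightning.plugins.ddp_plugin import DDPPlugin


class DefaultStatePlugin(DDPPlugin):
    """Hypothetical defaults for the plugin hooks the accelerator calls above."""

    def sync_optim_state(self, model):
        # Plain DDP: every rank already holds the full optimizer state,
        # so there is nothing to synchronise.
        pass

    def rank_should_save_optim_state(self, global_rank: int) -> bool:
        # Plain DDP: any rank may save; a sharded plugin could return
        # `global_rank == 0` after consolidating shards onto rank zero.
        return True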
16 changes: 16 additions & 0 deletions pytorch_lightning/plugins/ddp_plugin.py
@@ -1,5 +1,7 @@
from typing import List, Dict, Any

from torch.optim import Optimizer

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel

@@ -62,3 +64,17 @@ def configure_ddp(self, model, device_ids):
**self._ddp_kwargs,
)
return model

def on_before_forward(self, args: Any, model: LightningModule):
"""
Override to handle custom input to device logic. For DDP, no logic is required as this is handled internally
within the DDP wrapper.
Args:
args: Inputs to the model.
model: Model to train.
Returns: args moved to correct device if needed.
"""
return args

def optimizer_state(self, optimizer: Optimizer) -> dict:
return optimizer.state_dict()
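Note: the base `on_before_forward` is a pass-through because DDP scatters inputs to the right device itself. For a wrapper that does not, an override might move the batch explicitly; a minimal sketch (base class name assumed to be DDPPlugin, device handling only illustrative):

from typing import Any

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
from pytorch_lightning.utilities.apply_func import move_data_to_device


class DeviceMovingDDPPlugin(DDPPlugin):
    """Hypothetical plugin that moves step inputs onto the model's device itself."""

    def on_before_forward(self, args: Any, model: LightningModule):
        # Move every tensor in the step inputs to the model's device before
        # the wrapped forward runs.
        return move_data_to_device(args, model.device)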
8 changes: 8 additions & 0 deletions pytorch_lightning/plugins/native_amp.py
@@ -56,6 +56,14 @@ def training_step(self, fx, args):
output = fx(*args)
return output


def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
model = self.trainer.get_model()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type)

@property
def scaler(self):
return torch.cuda.amp.GradScaler()
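Note: the precision connector below switches to a `ShardedNativeAMPPlugin` when a sharded DDP plugin is configured. That class is not part of this hunk; given the two members exposed here, a plausible shape, with the fairscale scaler and optimizer-side clipping named only as assumptions, would be:

from typing import Union

from torch.optim import Optimizer

from pytorch_lightning.plugins.native_amp import NativeAMPPlugin


class ShardedNativeAMPPluginSketch(NativeAMPPlugin):
    """Sketch only; the real ShardedNativeAMPPlugin may differ."""

    @property
    def scaler(self):
        # A sharded optimizer needs a scaler that understands sharded state;
        # fairscale ships one, though the import path can vary by version.
        from fairscale.optim.grad_scaler import ShardedGradScaler
        return ShardedGradScaler()

    def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
        # With sharded state, clipping is coordinated by the sharded optimizer
        # rather than by iterating local parameters.
        optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type)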
3 changes: 2 additions & 1 deletion pytorch_lightning/plugins/precision_plugin.py
@@ -11,11 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
from typing import Union

from torch.optim import Optimizer

import abc


class PrecisionPlugin(abc.ABC):
"""
pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -298,10 +298,12 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
callback_states = self.trainer.on_save_checkpoint()
checkpoint['callbacks'] = callback_states

# dump optimizers
optimizer_states = []
for i, optimizer in enumerate(self.trainer.optimizers):
optimizer_states.append(optimizer.state_dict())
# Rely on accelerator to dump optimizer state
optimizer_state = self.trainer.accelerator_backend.optimizer_state(optimizer)
optimizer_states.append(optimizer_state)

checkpoint['optimizer_states'] = optimizer_states

# dump lr schedulers
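Note: the list built above is stored under `optimizer_states`, one entry per configured optimizer, and restoring is the usual `load_state_dict` round-trip. A minimal inspection sketch, with the checkpoint path purely illustrative:

import torch

# Hypothetical path; any checkpoint produced after this change applies.
checkpoint = torch.load('example.ckpt', map_location='cpu')

# One state dict per optimizer, in the order they were configured on the Trainer.
for i, opt_state in enumerate(checkpoint['optimizer_states']):
    print(i, sorted(opt_state.keys()))  # typically 'state' and 'param_groups'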
10 changes: 8 additions & 2 deletions pytorch_lightning/trainer/connectors/model_connector.py
@@ -22,7 +22,6 @@
LightningDataParallel,
)


class ModelConnector:
def __init__(self, trainer):
self.trainer = trainer
@@ -55,6 +54,13 @@ def copy_trainer_model_properties(self, model):
m.local_rank = self.trainer.local_rank

def get_model(self):
is_dp_module = isinstance(self.trainer.model, (LightningDistributedDataParallel, LightningDataParallel))
is_dp_module = isinstance(
self.trainer.model,
(
LightningShardedDataParallel,
LightningDistributedDataParallel,
LightningDataParallel
)
)
model = self.trainer.model.module if is_dp_module else self.trainer.model
return model
33 changes: 26 additions & 7 deletions pytorch_lightning/trainer/connectors/precision_connector.py
@@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

from pytorch_lightning import _logger as log
from pytorch_lightning.plugins.apex import ApexPlugin
from pytorch_lightning.plugins.native_amp import NativeAMPPlugin
from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin
from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin
from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, AMPType, rank_zero_warn


@@ -24,7 +27,7 @@ def __init__(self, trainer):
self.trainer = trainer
self.backend = None

def on_trainer_init(self, precision, amp_level, amp_backend):
def on_trainer_init(self, precision, amp_level, amp_backend, plugins):
# AMP init
# These are the only lines needed after v0.8.0
# we wrap the user's forward with autocast and give it back at the end of fit
@@ -33,14 +36,14 @@ def on_trainer_init(self, precision, amp_level, amp_backend):
self.trainer.scaler = None

self.trainer.amp_level = amp_level
self.init_amp(amp_backend)
self.init_amp(amp_backend, plugins)

def init_amp(self, amp_type: str):
def init_amp(self, amp_type: str, plugins: Optional[list]):
assert self.trainer.precision in (16, 32), 'only 32 or 16 bit precision supported'
self.trainer.amp_backend = None
self._setup_amp_backend(amp_type)
self._setup_amp_backend(amp_type, plugins)

def _setup_amp_backend(self, amp_type: str):
def _setup_amp_backend(self, amp_type: str, plugins: Optional[list]):
if self.trainer.precision != 16:
# no AMP requested, so we can leave now
return
@@ -54,9 +57,14 @@ def _setup_amp_backend(self, amp_type: str):
' We will attempt to use NVIDIA Apex for this session.')
amp_type = 'apex'
else:
log.info('Using native 16bit precision.')
self.trainer.amp_backend = AMPType.NATIVE
self.backend = NativeAMPPlugin(self.trainer)
log.info('Using native 16bit precision.')

if plugins and self._sharded_in_plugins(plugins):
log.info('Using Sharded 16bit plugin.')
self.backend = ShardedNativeAMPPlugin(self.trainer)
else:
self.backend = NativeAMPPlugin(self.trainer)

if amp_type == 'apex':
if not APEX_AVAILABLE:
@@ -79,3 +87,14 @@ def connect(self, model):
self.trainer.optimizers = optimizers

return model

@property
def scaler(self):
if self.trainer.amp_backend == AMPType.NATIVE and self.trainer.precision == 16 and not self.trainer.use_tpu:
return self.backend.scaler

def _sharded_in_plugins(self, plugins):
for plugin in plugins:
if isinstance(plugin, DDPShardedPlugin):
return True
return False
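Note: taken together, the selection logic above means sharded native AMP is opted into purely through the Trainer's `plugins` argument. A plausible end-to-end invocation, assuming `DDPShardedPlugin` needs no constructor arguments and that the usual distributed environment is available:

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin

# precision=16 routes through _setup_amp_backend; with a DDPShardedPlugin in
# `plugins`, the connector picks ShardedNativeAMPPlugin instead of NativeAMPPlugin.
trainer = Trainer(
    gpus=2,
    distributed_backend='ddp',
    precision=16,
    plugins=[DDPShardedPlugin()],
)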
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/trainer.py
@@ -400,7 +400,7 @@ def __init__(
)

# set precision
self.precision_connector.on_trainer_init(precision, amp_level, amp_backend)
self.precision_connector.on_trainer_init(precision, amp_level, amp_backend, plugins)

# last thing are the plugins which override whatever the trainer used by default
self.plugin_connector.on_trainer_init(plugins)