Commit 072c272

SeanNaren committed
Rely on ddp plugin for blocking sync behaviour, and skip if we're using manual optimization
1 parent 90b87dd commit 072c272

File tree

3 files changed: +49 -4 lines

pytorch_lightning/accelerators/accelerator.py
pytorch_lightning/plugins/ddp_plugin.py
pytorch_lightning/trainer/training_loop.py
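To make the diffs below easier to follow, here is a minimal sketch, independent of Lightning, of the mechanism the commit is organising: DistributedDataParallel's no_sync() context manager skips the gradient all-reduce on accumulation steps so communication only happens on the step that actually runs the optimizer. The function and argument names (run_accumulation_loop, loss_fn, accumulate_grad_batches) are illustrative, not taken from this commit.

# Minimal sketch (not Lightning code): skip DDP gradient all-reduce on
# accumulation steps, sync only on the step that runs the optimizer.
from contextlib import nullcontext

import torch
from torch.nn.parallel import DistributedDataParallel


def run_accumulation_loop(ddp_model: DistributedDataParallel,
                          loss_fn,
                          optimizer: torch.optim.Optimizer,
                          batches,
                          accumulate_grad_batches: int = 4):
    for i, (x, y) in enumerate(batches):
        last_step = (i + 1) % accumulate_grad_batches == 0
        # no_sync() disables the all-reduce hooks for this backward pass;
        # on the last accumulation step we use a no-op context instead,
        # so gradients are reduced exactly once per optimizer step.
        ctx = nullcontext() if last_step else ddp_model.no_sync()
        with ctx:
            loss = loss_fn(ddp_model(x), y) / accumulate_grad_batches
            loss.backward()
        if last_step:
            optimizer.step()
            optimizer.zero_grad()

The change below moves the decision of when and how to enter such a context out of the training loop and into the accelerator and its DDP plugin, and skips the blocking entirely under manual optimization.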

pytorch_lightning/accelerators/accelerator.py

Lines changed: 13 additions & 0 deletions

@@ -14,6 +14,7 @@
 
 from enum import Enum
 from typing import Any, Optional, Union
+from contextlib import contextmanager
 
 import torch
 from torch.optim import Optimizer
@@ -244,6 +245,18 @@ def __setstate__(self, d):
     def on_save(self, checkpoint):
         return checkpoint
 
+    @contextmanager
+    def block_ddp_plugin_sync_behaviour(self):
+        """
+        Blocks ddp sync gradients behaviour on backwards pass.
+        This is useful for skipping sync when accumulating gradients, reducing communication overhead
+        Returns: context manager with sync behaviour off
+        """
+        if self.ddp_plugin:
+            yield self.ddp_plugin.block_backward_sync(self.trainer.model)
+        else:
+            yield
+
 
 # TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos...
 class BackendType(Enum):
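The new accelerator hook is a small "delegate to the plugin, or fall back to a no-op" context manager. Below is a self-contained toy version of that shape with made-up DemoPlugin/DemoAccelerator names rather than the real classes; unlike the diff above, which yields the plugin's context-manager object for the caller to use, this toy enters it directly so the sketch runs on its own.

# Toy illustration of the delegation pattern, not the accelerator's real code.
from contextlib import contextmanager


class DemoPlugin:
    """Stand-in for a DDP plugin that exposes a sync-blocking context manager."""

    @contextmanager
    def block_backward_sync(self, model):
        # A real plugin would wrap model.no_sync() here; the prints just make
        # the control flow visible in this sketch.
        print("gradient sync blocked")
        yield
        print("gradient sync restored")


class DemoAccelerator:
    """Stand-in for an accelerator that may or may not own a DDP plugin."""

    def __init__(self, ddp_plugin=None, model=None):
        self.ddp_plugin = ddp_plugin
        self.model = model

    @contextmanager
    def block_ddp_plugin_sync_behaviour(self):
        if self.ddp_plugin is not None:
            # delegate to the plugin's own context manager
            with self.ddp_plugin.block_backward_sync(self.model):
                yield
        else:
            # no plugin configured: behave as a plain no-op context
            yield


# The call site looks identical with or without a plugin.
with DemoAccelerator(DemoPlugin()).block_ddp_plugin_sync_behaviour():
    pass  # backward pass would go here
with DemoAccelerator().block_ddp_plugin_sync_behaviour():
    pass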

pytorch_lightning/plugins/ddp_plugin.py

Lines changed: 10 additions & 0 deletions

@@ -1,4 +1,5 @@
 import os
+from contextlib import contextmanager
 from typing import Any, Dict, List, Union, Optional
 
 import torch.distributed as torch_distrib
@@ -131,3 +132,12 @@ def get_model_from_plugin(
         if isinstance(model, LightningDistributedDataParallel):
             return model.module
         return model
+
+    @contextmanager
+    def block_backward_sync(self, model: LightningDistributedDataParallel):
+        """
+        Blocks ddp sync gradients behaviour on backwards pass.
+        This is useful for skipping sync when accumulating gradients, reducing communication overhead
+        Returns: context manager with sync behaviour off
+        """
+        yield model.no_sync()
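Putting the no_sync() call behind DDPPlugin.block_backward_sync means the behaviour can be overridden per plugin instead of being hard-coded in the training loop. A hedged sketch of such an override, assuming DDPPlugin can be subclassed and handed to the Trainer through its plugins argument as in this era of the codebase; the subclass name is made up and the import path is an assumption about this snapshot of the code.

from contextlib import contextmanager

from pytorch_lightning.plugins.ddp_plugin import DDPPlugin


class AlwaysSyncDDPPlugin(DDPPlugin):
    """Illustrative plugin that never blocks the backward-pass sync, e.g. for a
    wrapped model that does not implement no_sync()."""

    @contextmanager
    def block_backward_sync(self, model):
        # Plain no-op context: gradients are all-reduced on every backward pass.
        yield

A Trainer configured with plugins=[AlwaysSyncDDPPlugin()] would then pay the all-reduce cost on every accumulation step, trading communication for simplicity.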

pytorch_lightning/trainer/training_loop.py

Lines changed: 26 additions & 4 deletions

@@ -668,8 +668,25 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
                 # -------------------
 
                 # perform dpp sync only when performing optimizer_step
-                with self.block_ddp_sync_behaviour():
-                    self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens)
+                if self.automatic_optimization:
+                    with self.block_ddp_sync_behaviour():
+                        self.training_step_and_backward(
+                            split_batch,
+                            batch_idx,
+                            opt_idx,
+                            optimizer,
+                            self.trainer.hiddens
+                        )
+                else:
+                    # do not block ddp gradient sync when using manual optimization
+                    # as gradients are needed within the training step
+                    self.training_step_and_backward(
+                        split_batch,
+                        batch_idx,
+                        opt_idx,
+                        optimizer,
+                        self.trainer.hiddens
+                    )
 
                 batch_outputs = self._process_closure_result(
                     batch_outputs=batch_outputs,
@@ -735,8 +752,13 @@ def train_step_and_backward_closure():
 
     @contextmanager
     def block_ddp_sync_behaviour(self):
-        if isinstance(self.trainer.model, torch.nn.parallel.DistributedDataParallel):
-            yield self.trainer.model.no_sync()
+        """
+        Blocks ddp sync gradients behaviour on backwards pass.
+        This is useful for skipping sync when accumulating gradients, reducing communication overhead
+        Returns: context manager with sync behaviour off
+        """
+        if self.trainer.accelerator_backend is not None:
+            yield self.trainer.accelerator_backend.block_ddp_plugin_sync_behaviour()
         else:
             yield
 
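The training-loop branch keeps gradient sync on for manual optimization because, in that mode, the optimizer step happens inside the training step itself. A framework-free sketch of why that matters; manual_training_step, loss_fn and the batch layout are illustrative names, not Lightning API.

def manual_training_step(ddp_model, loss_fn, optimizer, batch):
    # Sketch of a manually optimized step: backward and optimizer.step() happen
    # together, so gradients must already be all-reduced across ranks here.
    x, y = batch
    loss = loss_fn(ddp_model(x), y)
    # If this backward ran under ddp_model.no_sync(), each rank would step on
    # its local, un-reduced gradients and the replicas would drift apart.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.detach()

Under automatic optimization, by contrast, the loop knows the optimizer step only fires on the final accumulation batch, so it can safely wrap the intermediate backward passes in block_ddp_sync_behaviour().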
