From 379697289a7517ed0222d7017d62fe80b750025f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 22:22:26 +0100 Subject: [PATCH 01/25] naive replace --- .../accelerators/accelerator_connector.py | 36 +++++++++---------- pytorch_lightning/trainer/deprecated_api.py | 28 +++++++-------- tests/deprecated_api/test_remove_1-4.py | 25 ++++++++----- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index ce2a418cf2fa5..d0f3426402fef 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.utilities import _HOROVOD_AVAILABLE +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, DeviceType, DistributedType from pytorch_lightning import _logger as log from pytorch_lightning import accelerators from pytorch_lightning.accelerators.accelerator import Accelerator @@ -82,7 +82,8 @@ def on_trainer_init( self.trainer.sync_batchnorm = sync_batchnorm self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - self.trainer.on_tpu = self.trainer.tpu_cores is not None + if self.trainer.tpu_cores is not None: + self.trainer._device_type = DeviceType.TPU self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None @@ -102,7 +103,8 @@ def on_trainer_init( self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) self.trainer.root_device = torch.device("cpu") - self.trainer.on_gpu = True if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()) else False + if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()): + self.trainer._device_type = DeviceType.GPU # tpu state flags self.trainer.use_tpu = False @@ -115,7 +117,6 @@ def on_trainer_init( # override dist backend when using tpus if self.trainer.on_tpu: self.trainer.distributed_backend = "tpu" - self.trainer.use_tpu = True # init flags for SLURM+DDP to work self.trainer.world_size = 1 @@ -293,9 +294,9 @@ def set_distributed_mode(self): self._set_horovod_backend() elif self.trainer.num_gpus == 0: if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer.use_ddp = True # ddp_cpu + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True + self.trainer._device_type = DeviceType.GPU elif self.trainer.num_gpus > 1: rank_zero_warn( 'You requested multiple GPUs but did not specify a backend, e.g.' 
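
A minimal, self-contained sketch of the consolidation the hunks above and below perform: the mutually exclusive booleans (`on_cpu`, `on_gpu`, `on_tpu`, `use_tpu`, ...) collapse into a single enum-valued field, and the old names survive only as deprecated views over it. The class below is illustrative, not the real Trainer; the enum values are assumed to mirror `pytorch_lightning.utilities.DeviceType`, and `warnings.warn` stands in for `rank_zero_warn`.

    import warnings
    from enum import Enum

    class DeviceType(str, Enum):
        CPU = 'CPU'
        GPU = 'GPU'
        TPU = 'TPU'

    class _DeviceShim:
        def __init__(self):
            # single source of truth replacing the on_cpu / on_gpu / on_tpu booleans
            self._device_type = DeviceType.CPU

        @property
        def on_gpu(self) -> bool:
            # the old boolean becomes a derived view over the enum
            return self._device_type == DeviceType.GPU

        @on_gpu.setter
        def on_gpu(self, val: bool) -> None:
            # the real setter calls rank_zero_warn(...) with the same deprecation message
            warnings.warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning)
            if val:
                self._device_type = DeviceType.GPU

    shim = _DeviceShim()
    shim.on_gpu = True  # warns, then maps onto the enum
    assert shim._device_type is DeviceType.GPU and shim.on_gpu
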
@@ -307,35 +308,34 @@ def set_distributed_mode(self): if self.trainer.distributed_backend == "dp": # do nothing if num_gpus == 0 if self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True - self.trainer.use_dp = True + self.trainer._device_type = DeviceType.GPU + self.trainer._distrib_type = DistributedType.DP elif self.trainer.num_gpus > 1: - self.trainer.use_dp = True + self.trainer._distrib_type = DistributedType.DP elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"): if self.trainer.num_gpus == 0: if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer.use_ddp = True # ddp_cpu + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True - self.trainer.use_ddp = True + self.trainer._device_type = DeviceType.GPU + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus > 1: - self.trainer.use_ddp = True + self.trainer._distrib_type = DistributedType.DDP self.trainer.num_processes = self.trainer.num_gpus elif self.trainer.distributed_backend == "ddp2": # do nothing if num_gpus == 0 if self.trainer.num_gpus >= 1: - self.trainer.use_ddp2 = True + self.trainer._distrib_type = DistributedType.DDP2 elif self.trainer.distributed_backend == "ddp_cpu": if self.trainer.num_gpus > 0: rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.trainer.use_ddp = True + self.trainer._distrib_type = DistributedType.DDP self.trainer.data_parallel_device_ids = None - self.trainer.on_gpu = False - self.trainer.on_cpu = True + self.trainer.self._device_type = DeviceType.CPU elif self.trainer.distributed_backend == "horovod": self._set_horovod_backend() @@ -355,7 +355,7 @@ def set_distributed_mode(self): def _set_horovod_backend(self): self.check_horovod() - self.trainer.use_horovod = True + self.trainer._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info hvd.init() diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 7b4de47a1be2c..2db6dabd8e0d3 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.utilities import DistributedType, DeviceType +from pytorch_lightning.utilities import DistributedType, DeviceType, rank_zero_warn class DeprecatedDistDeviceAttributes: @@ -28,7 +28,7 @@ def on_cpu(self) -> bool: @on_cpu.setter def on_cpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_cpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._device_type = DeviceType.CPU @@ -39,7 +39,7 @@ def on_tpu(self) -> bool: @on_tpu.setter def on_tpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_tpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) # todo add logic that it cannot be set if TPU is missing if val: self._device_type = DeviceType.TPU @@ -51,7 +51,7 @@ def use_tpu(self) -> bool: @use_tpu.setter def use_tpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_tpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) self.on_tpu = val @property @@ -61,7 +61,7 @@ def on_gpu(self) -> bool: @on_gpu.setter def on_gpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_gpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) # todo add logic that it cannot be set if GPU is missing if val: self._device_type = DeviceType.GPU @@ -73,7 +73,7 @@ def use_dp(self) -> bool: @use_dp.setter def use_dp(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_dp` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DP @@ -84,7 +84,7 @@ def use_ddp(self) -> bool: @use_ddp.setter def use_ddp(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_ddp` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DDP @@ -95,7 +95,7 @@ def use_ddp2(self) -> bool: @use_ddp2.setter def use_ddp2(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_ddp2` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_ddp2` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DDP2 @@ -108,9 +108,9 @@ def use_horovod(self) -> bool: @use_horovod.setter def use_horovod(self, val: bool) -> None: - # rank_zero_warn( - # "Internal: `use_horovod` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning - # ) + rank_zero_warn( + "Internal: `use_horovod` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning + ) if val: self._distrib_type = DistributedType.HOROVOD @@ -126,8 +126,8 @@ def use_single_gpu(self) -> bool: @use_single_gpu.setter def use_single_gpu(self, val: bool) -> None: - # rank_zero_warn( - # "Internal: `use_single_gpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning, - # ) + rank_zero_warn( + "Internal: 
`use_single_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning, + ) if val: self._device_type = DeviceType.GPU diff --git a/tests/deprecated_api/test_remove_1-4.py b/tests/deprecated_api/test_remove_1-4.py index 9a7a970aecaf7..2d2d59be0f797 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -37,35 +37,42 @@ def test_v1_4_0_deprecated_imports(): from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils # noqa: F811 F401 -# todo: later add also checking deprecated warnings def test_v1_4_0_deprecated_trainer_attributes(): """Test that Trainer attributes works fine.""" trainer = Trainer() trainer._distrib_type = None trainer._device_type = None - trainer.on_cpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_cpu = True assert trainer.on_cpu - trainer.on_gpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_gpu = True assert trainer.on_gpu - trainer.on_tpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_tpu = True assert trainer.on_tpu trainer._device_type = None - trainer.use_tpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_tpu = True assert trainer.use_tpu - trainer.use_dp = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_dp = True assert trainer.use_dp - trainer.use_ddp = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_ddp = True assert trainer.use_ddp - trainer.use_ddp2 = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_ddp2 = True assert trainer.use_ddp2 - trainer.use_horovod = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_horovod = True assert trainer.use_horovod From 0cbc4cc9fc207ea3b07b1169c107ae3ff2255fec Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:03:17 +0100 Subject: [PATCH 02/25] simplify --- .../accelerators/accelerator_connector.py | 60 ++++++++----------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0f3426402fef..9b5e6bfc9d0dc 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -283,11 +283,6 @@ def select_accelerator(self): return accelerator_backend def set_distributed_mode(self): - self.trainer.use_dp = False - self.trainer.use_ddp = False - self.trainer.use_ddp2 = False - self.trainer.use_horovod = False - self.trainer.use_single_gpu = False if self.trainer.distributed_backend is None: if self.has_horovodrun(): @@ -305,42 +300,37 @@ def set_distributed_mode(self): ) self.trainer.distributed_backend = "ddp_spawn" - if self.trainer.distributed_backend == "dp": - # do nothing if num_gpus == 0 - if self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU - self.trainer._distrib_type = DistributedType.DP - elif self.trainer.num_gpus > 1: - self.trainer._distrib_type = DistributedType.DP - - elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"): - if self.trainer.num_gpus == 0: - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - 
self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU - self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus > 1: - self.trainer._distrib_type = DistributedType.DDP - self.trainer.num_processes = self.trainer.num_gpus - - elif self.trainer.distributed_backend == "ddp2": - # do nothing if num_gpus == 0 - if self.trainer.num_gpus >= 1: - self.trainer._distrib_type = DistributedType.DDP2 - elif self.trainer.distributed_backend == "ddp_cpu": + if self.trainer.distributed_backend == "ddp_cpu": + self.trainer._distrib_type = DistributedType.DDP + self.trainer.data_parallel_device_ids = None if self.trainer.num_gpus > 0: rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.trainer._distrib_type = DistributedType.DDP - self.trainer.data_parallel_device_ids = None - self.trainer.self._device_type = DeviceType.CPU - elif self.trainer.distributed_backend == "horovod": + else: + self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) + + if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: + self.trainer._device_type = DeviceType.GPU + + if self.trainer.num_gpus == 0 and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2): + rank_zero_warn( + 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' + ) + if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: + self.trainer._distrib_type = DistributedType.DDP + else: + rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + self.trainer._distrib_type = None + + if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): + self.trainer.num_processes = self.trainer.num_gpus + + if self.trainer.distributed_backend == "horovod": self._set_horovod_backend() # throw error to force user ddp or ddp2 choice - if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2 or self.trainer.use_ddp): + if self.trainer.num_nodes > 1 and self.trainer._distrib_type not in (DistributedType.DDP2, DistributedType.DDP): raise MisconfigurationException( 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' @@ -350,7 +340,7 @@ def set_distributed_mode(self): num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0 rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') - if torch.cuda.is_available() and not self.trainer.on_gpu: + if torch.cuda.is_available() and self.trainer._device_type != DeviceType.GPU: rank_zero_warn('GPU available but not used. 
Set the --gpus flag when calling the script.') def _set_horovod_backend(self): From 9b3c68d595c55f9dd0e3034b7942f6d17296faa4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:12:06 +0100 Subject: [PATCH 03/25] clean --- .../accelerators/accelerator_connector.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 9b5e6bfc9d0dc..167a7df01cfe1 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -83,7 +83,12 @@ def on_trainer_init( self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) if self.trainer.tpu_cores is not None: - self.trainer._device_type = DeviceType.TPU + if _TPU_AVAILABLE: + self.trainer._device_type = DeviceType.TPU + else: + raise MisconfigurationException( + f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." + ) self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None @@ -101,20 +106,15 @@ def on_trainer_init( self.trainer.data_parallel_device_ids = device_parser.parse_gpu_ids(self.trainer.gpus) self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) - self.trainer.root_device = torch.device("cpu") - - if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()): - self.trainer._device_type = DeviceType.GPU # tpu state flags - self.trainer.use_tpu = False self.trainer.tpu_local_core_rank = None self.trainer.tpu_global_core_rank = None # distributed backend choice self.set_distributed_mode() - # override dist backend when using tpus + # override dist backend when using TPUs if self.trainer.on_tpu: self.trainer.distributed_backend = "tpu" @@ -138,8 +138,10 @@ def on_trainer_init( def _map_deprecated_dist_backend(self, accelerator, distributed_backend): if distributed_backend is not None: - rank_zero_warn(DeprecationWarning('distributed_backend has been renamed to accelerator. ' - 'Deprecated in 1.0.0, will be removed in 1.2.0')) + rank_zero_warn( + '`distributed_backend` has been renamed to accelerator. Deprecated in 1.0.0, will be removed in 1.2.0', + DeprecationWarning + ) # temporary mapping until we remove all the distributed_backend references if accelerator is not None: @@ -344,7 +346,7 @@ def set_distributed_mode(self): rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') def _set_horovod_backend(self): - self.check_horovod() + self._check_horovod() self.trainer._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info @@ -353,7 +355,7 @@ def _set_horovod_backend(self): # Horovod assigns one local GPU per process self.trainer.root_gpu = hvd.local_rank() - def check_horovod(self): + def _check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" if not _HOROVOD_AVAILABLE: raise MisconfigurationException( From 71a81b1bd44cc54f5b55fbc1455e10acfbab6737 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:23:41 +0100 Subject: [PATCH 04/25] . 
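
The one-line change below replaces the bare `else:` with `elif self.trainer.distributed_backend:` before the backend string is converted to the enum. The guard matters because constructing the enum from None (or an empty string) raises ValueError, whereas a concrete backend name maps cleanly onto a member. A minimal sketch, with values assumed to mirror `pytorch_lightning.utilities.DistributedType`:

    from enum import Enum

    class DistributedType(str, Enum):
        DP = 'dp'
        DDP = 'ddp'
        DDP_SPAWN = 'ddp_spawn'
        DDP2 = 'ddp2'
        HOROVOD = 'horovod'

    assert DistributedType('ddp') is DistributedType.DDP  # value lookup works for a real backend

    distributed_backend = None
    try:
        DistributedType(distributed_backend)
    except ValueError:
        # reaching here is why the assignment is guarded by `elif self.trainer.distributed_backend:`
        pass
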
--- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 167a7df01cfe1..fc97bdbe7b7f2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -309,7 +309,7 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - else: + elif self.trainer.distributed_backend: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: From 1c43e5ee5edfa740f7041913b4b64f1599b302dc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 00:40:57 +0100 Subject: [PATCH 05/25] fix --- .../accelerators/accelerator_connector.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index fc97bdbe7b7f2..d58c42bc26c4d 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -289,19 +289,16 @@ def set_distributed_mode(self): if self.trainer.distributed_backend is None: if self.has_horovodrun(): self._set_horovod_backend() - elif self.trainer.num_gpus == 0: - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU + elif self.trainer.num_gpus == 0 and (self.trainer.num_nodes > 1 or self.trainer.num_processes > 1): + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus > 1: rank_zero_warn( 'You requested multiple GPUs but did not specify a backend, e.g.' - ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`.' - ' Setting `accelerator="ddp_spawn"` for you.' + ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.' ) self.trainer.distributed_backend = "ddp_spawn" + # special case with DDP on CPUs if self.trainer.distributed_backend == "ddp_cpu": self.trainer._distrib_type = DistributedType.DDP self.trainer.data_parallel_device_ids = None @@ -309,13 +306,18 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - elif self.trainer.distributed_backend: + # set all other requested distrib. 
types adn if it was not set in the + elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) - if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: + # unless you request explicitly for CPU and some GPU are available use them + if (self.trainer.num_gpus > 0 + and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): self.trainer._device_type = DeviceType.GPU - if self.trainer.num_gpus == 0 and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2): + # DP and DDP2 cannot run without GPU + if (self.trainer.num_gpus == 0 + and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP, DistributedType.DDP2)): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) @@ -325,9 +327,12 @@ def set_distributed_mode(self): rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') self.trainer._distrib_type = None - if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): + # for DDP overwrite nb processes by requested GPUs + if (self.trainer._device_type == DeviceType.GPU + and self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): self.trainer.num_processes = self.trainer.num_gpus + # Horovod si an extra case... if self.trainer.distributed_backend == "horovod": self._set_horovod_backend() From 64c73d57e7d3f257f839a797cf4241ed8ad0116c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 00:55:15 +0100 Subject: [PATCH 06/25] . --- pytorch_lightning/trainer/deprecated_api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 2db6dabd8e0d3..2c8377d2936c9 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -40,7 +40,6 @@ def on_tpu(self) -> bool: @on_tpu.setter def on_tpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - # todo add logic that it cannot be set if TPU is missing if val: self._device_type = DeviceType.TPU @@ -62,7 +61,6 @@ def on_gpu(self) -> bool: @on_gpu.setter def on_gpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - # todo add logic that it cannot be set if GPU is missing if val: self._device_type = DeviceType.GPU From 7af6832f69a75bf5a94b7770baa5e8ebe7723fea Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 19:25:37 +0100 Subject: [PATCH 07/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d58c42bc26c4d..2db6f1c2bdbde 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -321,7 +321,9 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
) - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: + # in some cases it yield in comarison None and int + if ((self.trainer.num_nodes and self.trainer.num_nodes > 1) + or (self.trainer.num_processes and self.trainer.num_processes > 1)): self.trainer._distrib_type = DistributedType.DDP else: rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') From ff748549f60f37711324ec3e2998635d84d415c1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 19:47:39 +0100 Subject: [PATCH 08/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 2db6f1c2bdbde..8700a6d5f6b31 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -317,11 +317,11 @@ def set_distributed_mode(self): # DP and DDP2 cannot run without GPU if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP, DistributedType.DDP2)): + and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2)): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) - # in some cases it yield in comarison None and int + # todo: in some cases it yield in comarison None and int if ((self.trainer.num_nodes and self.trainer.num_nodes > 1) or (self.trainer.num_processes and self.trainer.num_processes > 1)): self.trainer._distrib_type = DistributedType.DDP From b3c8f96d8e1a6381f6e332133e724b2b863fa8bb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 31 Dec 2020 11:46:47 +0100 Subject: [PATCH 09/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8700a6d5f6b31..8cd579a22e926 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -317,7 +317,9 @@ def set_distributed_mode(self): # DP and DDP2 cannot run without GPU if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2)): + and self.trainer._distrib_type in ( + DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2 + )): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
) From f55383bbe572916f9d415a58fb35f2b3b94e0873 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 31 Dec 2020 11:51:21 +0100 Subject: [PATCH 10/25] flake8 --- pytorch_lightning/accelerators/accelerator_connector.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8cd579a22e926..fea3df06b7df4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -315,11 +315,9 @@ def set_distributed_mode(self): and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): self.trainer._device_type = DeviceType.GPU + _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU - if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in ( - DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2 - )): + if (self.trainer.num_gpus == 0 and self.trainer._distrib_type in _distrib_types): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) From 60240a624734ff7af7654fbc413041c9f9f2da3d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 10:20:15 +0100 Subject: [PATCH 11/25] text --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index fea3df06b7df4..4e736a20856e9 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -279,7 +279,9 @@ def select_accelerator(self): accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( - f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend' + f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend for' + f' num_nodes={self.trainer.num_nodes}, num_gpus={self.trainer.num_gpus}' + f' and num_processes={self.trainer.num_processes}.' 
) return accelerator_backend From a70616e1d65dd421069123468fc233b8177cc12a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:35:41 +0100 Subject: [PATCH 12/25] 2 --- tests/backends/test_accelerator_connector.py | 10 +++++----- tests/checkpointing/test_torch_saving.py | 3 +-- tests/plugins/test_amp_plugin.py | 12 ++++++++---- tests/plugins/test_apex_plugin.py | 12 ++++++++---- tests/plugins/test_ddp_plugin.py | 12 ++++++------ tests/plugins/test_plugin.py | 4 ++-- tests/plugins/test_rpc_plugin.py | 2 +- tests/plugins/test_sharded_plugin.py | 4 ++-- 8 files changed, 33 insertions(+), 26 deletions(-) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index b9b4263d0cf50..56f69c7970f15 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -242,7 +242,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -270,7 +270,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -307,7 +307,7 @@ def on_fit_start(self, trainer, pl_module): plugins=[CustomCluster()], fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -341,7 +341,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator=Accel(), - num_processes=1, + num_processes=2, callbacks=[CB()] ) @@ -367,7 +367,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()] ) diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 493aa0dabe126..a15d425f5a0e7 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -43,8 +43,7 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer): assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") def test_model_torch_save_ddp_cpu(tmpdir): """Test to ensure torch save does not fail for model and trainer using cpu ddp.""" model = BoringModel() diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 6c5a7b052d0d1..1e98740f99d62 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -21,8 +21,10 @@ "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -55,8 +57,10 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) 
+@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyNativeAMP(NativeAMPPlugin): pass diff --git a/tests/plugins/test_apex_plugin.py b/tests/plugins/test_apex_plugin.py index bfed1aefec0a1..c4198b97446c3 100644 --- a/tests/plugins/test_apex_plugin.py +++ b/tests/plugins/test_apex_plugin.py @@ -18,8 +18,10 @@ "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -52,8 +54,10 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyApexPlugin(ApexPlugin): pass diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 4e51fc7c5ac21..fe8fc555ba06c 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -27,7 +27,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -62,7 +62,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyDDP(DDPPlugin): @@ -101,7 +101,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @@ -139,7 +139,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_invalid_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, 
num_processes): with pytest.raises(MisconfigurationException, match='not a supported lightning custom plugin'): @@ -166,7 +166,7 @@ def test_ddp_invalid_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_proces @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @@ -202,7 +202,7 @@ class MyDDP(DDPPlugin): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_custom_ddp_cpu_custom_args( tmpdir, ddp_backend, gpus, num_processes diff --git a/tests/plugins/test_plugin.py b/tests/plugins/test_plugin.py index be9d95f09f03f..05789596879b4 100644 --- a/tests/plugins/test_plugin.py +++ b/tests/plugins/test_plugin.py @@ -38,7 +38,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): """ @@ -92,7 +92,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_invalid_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): """ diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py index 87d64a7b8c686..a28cd4b50e4f4 100644 --- a/tests/plugins/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -26,7 +26,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _RPC_AVAILABLE, reason="RPC is not available") def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes): diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index c0761b7e03fcb..b4a09760bc31c 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -28,7 +28,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): @@ -89,7 +89,7 @@ def test_invalid_apex_sharded(tmpdir): @mock.patch("torch.cuda.device_count", return_value=2) 
@pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") From 7780f3f7eb29f1280dca68d64de948a2db95f12b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:47:05 +0100 Subject: [PATCH 13/25] max --- .../accelerators/accelerator_connector.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4e736a20856e9..804810140bf79 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -81,16 +81,7 @@ def on_trainer_init( # sync-bn backend self.trainer.sync_batchnorm = sync_batchnorm - self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - if self.trainer.tpu_cores is not None: - if _TPU_AVAILABLE: - self.trainer._device_type = DeviceType.TPU - else: - raise MisconfigurationException( - f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." - ) - - self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None + self._parse_tpu_device_details(tpu_cores) if num_processes != 1 and distributed_backend != "ddp_cpu": rank_zero_warn("num_processes is only used for `accelerator='ddp_cpu'`. Ignoring it.") @@ -107,17 +98,9 @@ def on_trainer_init( self.trainer.data_parallel_device_ids = device_parser.parse_gpu_ids(self.trainer.gpus) self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) - # tpu state flags - self.trainer.tpu_local_core_rank = None - self.trainer.tpu_global_core_rank = None - # distributed backend choice self.set_distributed_mode() - # override dist backend when using TPUs - if self.trainer.on_tpu: - self.trainer.distributed_backend = "tpu" - # init flags for SLURM+DDP to work self.trainer.world_size = 1 self.trainer.interactive_ddp_procs = [] @@ -136,6 +119,23 @@ def on_trainer_init( self.trainer.replace_sampler_ddp = replace_sampler_ddp + def _parse_tpu_device_details(self, tpu_cores): + self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + if self.trainer.tpu_cores is not None: + if _TPU_AVAILABLE: + self.trainer._device_type = DeviceType.TPU + self.trainer.distributed_backend = "tpu" + else: + raise MisconfigurationException( + f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." 
+ ) + + self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None + + # tpu state flags + self.trainer.tpu_local_core_rank = None + self.trainer.tpu_global_core_rank = None + def _map_deprecated_dist_backend(self, accelerator, distributed_backend): if distributed_backend is not None: rank_zero_warn( @@ -313,9 +313,12 @@ def set_distributed_mode(self): self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) # unless you request explicitly for CPU and some GPU are available use them - if (self.trainer.num_gpus > 0 - and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): + _on_cpu = self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend + if (self.trainer.num_gpus > 0 and not _on_cpu): self.trainer._device_type = DeviceType.GPU + elif self.trainer._device_type == DeviceType.CPU and self.trainer.num_processes is None: + # define the max CPU available + self.trainer.num_processes = os.cpu_count() _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU From 4a11e1ad427052b19a8c9b63bf5da2195af3c08c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:55:57 +0100 Subject: [PATCH 14/25] max --- pytorch_lightning/accelerators/accelerator_connector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 804810140bf79..e0b155e62b904 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -308,6 +308,10 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) + if self.trainer.num_processes is None: + # define the max CPU available + self.trainer.num_processes = os.cpu_count() + # set all other requested distrib. 
types adn if it was not set in the elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) @@ -316,9 +320,6 @@ def set_distributed_mode(self): _on_cpu = self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend if (self.trainer.num_gpus > 0 and not _on_cpu): self.trainer._device_type = DeviceType.GPU - elif self.trainer._device_type == DeviceType.CPU and self.trainer.num_processes is None: - # define the max CPU available - self.trainer.num_processes = os.cpu_count() _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU From 5d3a550a58c1d767ca9f972cbff4d2d726d1d6c0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 15:09:15 +0100 Subject: [PATCH 15/25] 2 --- tests/backends/test_accelerator_connector.py | 1 + tests/plugins/test_sharded_plugin.py | 5 +++++ tests/trainer/properties/test_get_model.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 56f69c7970f15..5ca29d27d292e 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -50,6 +50,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', + num_processes=2, callbacks=[CB()], ) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index b4a09760bc31c..d8334e24e0e83 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -129,6 +129,7 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -208,6 +209,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -221,6 +223,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, resume_from_checkpoint=checkpoint_path @@ -291,6 +294,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): trainer = Trainer( plugins=[DDPShardedPlugin()], accelerator='ddp_cpu', + num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path ) @@ -308,6 +312,7 @@ def test_ddp_sharded_plugin_test(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index ca1301fb0dec6..16434f390b90a 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -61,7 +61,7 @@ def test_get_model_ddp_cpu(tmpdir): limit_val_batches=2, max_epochs=1, accelerator='ddp_cpu', - num_processes=2 + num_processes=2, ) trainer.fit(model) From 0beae5754363d9ca1bdfb382c725913bf24e13cd Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 16:14:38 +0100 Subject: [PATCH 16/25] 2 --- pytorch_lightning/plugins/plugin_connector.py | 4 ++-- tests/backends/test_accelerator_connector.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index d66c25173cc77..596a630b1c959 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -31,8 +31,8 @@ def __init__(self, trainer): self.plugins = [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - self.amp_plugin = NativeAMPPlugin(trainer) - self.apex_plugin = ApexPlugin(trainer) + # self.amp_plugin = NativeAMPPlugin(trainer) + # self.apex_plugin = ApexPlugin(trainer) def on_trainer_init(self, plugins: Optional[Union[str, list]]): self.plugins = plugins diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 5ca29d27d292e..dc8bf338d3eb3 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -252,7 +252,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -280,7 +280,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -317,7 +317,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -351,7 +351,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -369,7 +369,7 @@ def on_fit_start(self, trainer, pl_module): fast_dev_run=True, accelerator='ddp_cpu', num_processes=2, - callbacks=[CB()] + callbacks=[CB()], ) with pytest.raises(SystemExit): From 0e99cb99236c49a6b8ce45ddd619ff0dc6cb57a4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 16:53:51 +0100 Subject: [PATCH 17/25] tpu --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e0b155e62b904..bda4cd05383f2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -311,7 +311,9 @@ def set_distributed_mode(self): if self.trainer.num_processes is None: # define the max CPU available self.trainer.num_processes = os.cpu_count() - + # special case with TPUs + elif self.trainer.distributed_backend == 'tpu': + self.trainer._device_type = DeviceType.TPU # set all other requested distrib. 
types adn if it was not set in the elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) From ed634353a48a5e738e739678349e554fd5b336ae Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 17:01:53 +0100 Subject: [PATCH 18/25] 2 --- benchmarks/test_sharded_parity.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index bd7f335a03720..2616f5a11d7f2 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,12 +1,12 @@ import os import platform import time -from typing import Union +from typing import Union, Type import pytest import torch -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import Trainer, seed_everything, LightningModule from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE @@ -20,9 +20,10 @@ def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( accelerator='ddp_cpu', + num_processes=2, max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -35,7 +36,7 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): gpus=1, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -50,7 +51,7 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): precision=16, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -65,7 +66,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -81,7 +82,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -97,7 +98,7 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -145,7 +146,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -163,7 +164,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -259,13 +260,15 @@ def record_ddp_fit_model_stats(trainer, model, use_cuda): def plugin_parity_test( - model_cls: SeedTrainLoaderModel, + model_cls: Type[SeedTrainLoaderModel], plugin: Union[str, DDPPlugin], seed: int = 42, accelerator: str = 'ddp_spawn', gpus: int = 0, 
precision: int = 32, - max_percent_speed_diff: float = 0.1): + max_percent_speed_diff: float = 0.1, + **kwargs, +): """ Ensures that the trained model is identical to the standard DDP implementation. Also checks for speed/memory regressions, we should expect always less memory but performance to fluctuate. @@ -293,6 +296,7 @@ def plugin_parity_test( gpus=gpus, precision=precision, accelerator=accelerator, + **kwargs, ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( From 8fc771148d3ccfaa493ffe8a7ac69c55c49629b6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 17:05:22 +0100 Subject: [PATCH 19/25] flake8 --- benchmarks/test_sharded_parity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 2616f5a11d7f2..5b0445c5b9b80 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,12 +1,12 @@ import os import platform import time -from typing import Union, Type +from typing import Type, Union import pytest import torch -from pytorch_lightning import Trainer, seed_everything, LightningModule +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE From fc26e76004ead251acb0cb83e8da82f5851ec99a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 18:11:55 +0100 Subject: [PATCH 20/25] . --- benchmarks/test_sharded_parity.py | 1 + pytorch_lightning/accelerators/accelerator_connector.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 5b0445c5b9b80..cff3a0c0ecdb9 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -316,6 +316,7 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], + **kwargs, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index bda4cd05383f2..7417f889dd808 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -279,9 +279,9 @@ def select_accelerator(self): accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( - f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend for' - f' num_nodes={self.trainer.num_nodes}, num_gpus={self.trainer.num_gpus}' - f' and num_processes={self.trainer.num_processes}.' + f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},' + f' num_processes={self.trainer.num_processes}, ...)` is not a supported backend for' + f' num_gpus={self.trainer.num_gpus}' ) return accelerator_backend From cbd05810c0d5fa39e1596d01017e0994ea9eaa4f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 19:18:33 +0100 Subject: [PATCH 21/25] . 
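
Alongside these benchmark cleanups: earlier in this series `plugin_parity_test` gained a `**kwargs` pass-through, which is how the `ddp_cpu` case now supplies its explicit `num_processes=2` to both runs. A trimmed, hypothetical version of that helper (the real one also records memory use and run time):

    from pytorch_lightning import Trainer, seed_everything

    def plugin_parity_test(model_cls, plugin, accelerator='ddp_spawn', gpus=0, precision=32, **kwargs):
        # kwargs such as num_processes=2 reach both Trainer runs unchanged,
        # so the plain-DDP baseline and the plugin run stay configured identically
        seed_everything(42)
        ddp_model = model_cls()
        Trainer(max_epochs=1, gpus=gpus, precision=precision,
                accelerator=accelerator, **kwargs).fit(ddp_model)

        seed_everything(42)
        custom_plugin_model = model_cls()
        Trainer(max_epochs=1, gpus=gpus, precision=precision,
                accelerator=accelerator, plugins=[plugin], **kwargs).fit(custom_plugin_model)
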
--- benchmarks/test_sharded_parity.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index cff3a0c0ecdb9..9688a603451f1 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -14,8 +14,7 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( @@ -28,8 +27,7 @@ def test_ddp_sharded_plugin_correctness_one_device(): @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): plugin_parity_test( @@ -42,8 +40,7 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): plugin_parity_test( @@ -57,8 +54,7 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): @pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): plugin_parity_test( @@ -71,8 +67,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @@ -87,8 +82,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") 
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): @@ -134,8 +128,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ @@ -152,8 +145,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ From 2f6f608156f3bf705c6c44354cc19cf6d348629f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 2 Jan 2021 16:39:09 +0100 Subject: [PATCH 22/25] . @SeanNaren --- benchmarks/test_sharded_parity.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 9688a603451f1..1a9aed58b97c4 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -14,18 +14,6 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") -@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_sharded_plugin_correctness_one_device(): - plugin_parity_test( - accelerator='ddp_cpu', - num_processes=2, - max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls - plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel, - ) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") From 233515cc5880ff838d082336a2189b9267d659b2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 3 Jan 2021 01:42:48 +0100 Subject: [PATCH 23/25] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57105e252dfb0..d3baec790195d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed distributed setting and `ddp_cpu` only with `num_processes>1` ([#5297](https://github.com/PyTorchLightning/pytorch-lightning/pull/5297)) + ## [1.1.0] - 2020-12-09 From 713bc040f70b80dae677be57fca2c282c951499e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 Jan 2021 14:48:57 +0100 Subject: [PATCH 24/25] Apply suggestions from code review --- benchmarks/test_sharded_parity.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 1a9aed58b97c4..0f58cb882bcf9 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -247,7 +247,6 @@ def plugin_parity_test( gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, - **kwargs, ): """ Ensures that the trained model is identical to the standard DDP implementation. @@ -276,7 +275,6 @@ def plugin_parity_test( gpus=gpus, precision=precision, accelerator=accelerator, - **kwargs, ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( @@ -296,7 +294,6 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], - **kwargs, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( From ad7fdee3f78523f31eb114b1b8cecbcee5a6bcdb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 Jan 2021 17:06:29 +0100 Subject: [PATCH 25/25] . --- pytorch_lightning/plugins/plugin_connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index 596a630b1c959..ccd128d87a26a 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -31,8 +31,6 @@ def __init__(self, trainer): self.plugins = [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - # self.amp_plugin = NativeAMPPlugin(trainer) - # self.apex_plugin = ApexPlugin(trainer) def on_trainer_init(self, plugins: Optional[Union[str, list]]): self.plugins = plugins
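
For orientation, the net effect of the connector changes in this series is that the selected backend is tracked by the `DistributedType` enum on the trainer (the private `_distrib_type` attribute), set either by the explicit branches in `set_distributed_mode` or by the string fallback `DistributedType(self.trainer.distributed_backend)` shown above. Below is a minimal sketch of that mapping, assuming the attribute and enum names from the diffs; the `ddp_cpu` expectation follows the branch in this series and the changelog entry added in PATCH 23/25 (#5297), and is an assumption about this code state rather than a statement about any released API.

from pytorch_lightning import Trainer
from pytorch_lightning.utilities import DistributedType

# DistributedType is a value-based enum, so it can be constructed straight from
# the backend string, which is what the fallback
# `DistributedType(self.trainer.distributed_backend)` in this series relies on.
assert DistributedType("ddp") == DistributedType.DDP

# Per the ddp_cpu branch in this series, requesting ddp_cpu with num_processes > 1
# should be treated as DDP on CPU. `_distrib_type` is private API and the exact
# value here is inferred from the diffs above, not guaranteed by released versions.
trainer = Trainer(accelerator="ddp_cpu", num_processes=2)
assert trainer._distrib_type == DistributedType.DDP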