From fddeee3a0facb50639f94fbcaa90ca221381e9c1 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:02 +0100 Subject: [PATCH 001/157] move to old package --- .../accelerators/{ => old}/__init__.py | 0 .../accelerators/old/accelerator.py | 259 ++++++++++++++++++ .../{ => old}/accelerator_connector.py | 0 .../accelerators/{ => old}/cpu_accelerator.py | 0 .../{ => old}/ddp2_accelerator.py | 0 .../accelerators/{ => old}/ddp_accelerator.py | 0 .../{ => old}/ddp_cpu_hpc_accelerator.py | 0 .../{ => old}/ddp_cpu_spawn_accelerator.py | 0 .../{ => old}/ddp_hpc_accelerator.py | 0 .../{ => old}/ddp_spawn_accelerator.py | 0 .../accelerators/{ => old}/dp_accelerator.py | 0 .../accelerators/{ => old}/gpu_accelerator.py | 0 .../{ => old}/horovod_accelerator.py | 0 .../accelerators/{ => old}/tpu_accelerator.py | 0 14 files changed, 259 insertions(+) rename pytorch_lightning/accelerators/{ => old}/__init__.py (100%) create mode 100644 pytorch_lightning/accelerators/old/accelerator.py rename pytorch_lightning/accelerators/{ => old}/accelerator_connector.py (100%) rename pytorch_lightning/accelerators/{ => old}/cpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp2_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_cpu_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_cpu_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/dp_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/gpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/horovod_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/tpu_accelerator.py (100%) diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/old/__init__.py similarity index 100% rename from pytorch_lightning/accelerators/__init__.py rename to pytorch_lightning/accelerators/old/__init__.py diff --git a/pytorch_lightning/accelerators/old/accelerator.py b/pytorch_lightning/accelerators/old/accelerator.py new file mode 100644 index 0000000000000..b16e0125054bb --- /dev/null +++ b/pytorch_lightning/accelerators/old/accelerator.py @@ -0,0 +1,259 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
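The renames listed above move the existing accelerator modules under pytorch_lightning/accelerators/old/ unchanged (100% similarity), while old/accelerator.py is added as a copy of the current base class. A minimal sketch of how downstream imports shift after this commit; CPUAccelerator is used purely as an illustrative class name from the moved cpu_accelerator module:

    # before patch 001
    from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator

    # after patch 001 the same class lives in the "old" subpackage
    from pytorch_lightning.accelerators.old.cpu_accelerator import CPUAccelerator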
+import os +import math +from enum import Enum +from pytorch_lightning.core.lightning import LightningModule +from typing import Any, Optional, Union + +import torch + +from pytorch_lightning.utilities import AMPType, rank_zero_warn +from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.parsing import AttributeDict +import torch.distributed as torch_distrib +from pytorch_lightning import _logger as log + +try: + from apex import amp +except ImportError: + amp = None + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +EPSILON = 1e-6 +EPSILON_FP16 = 1e-5 + + +class Accelerator(object): + def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None): + self.trainer = trainer + self.nickname = None + self.cluster_environment = cluster_environment + self.dist = AttributeDict(rank=0, device=None) + self.ddp_plugin = ddp_plugin + + if trainer is not None: + self.train_loop = self.trainer.train + self.validation_loop = self.trainer.run_evaluation + self.test_loop = self.trainer.run_evaluation + + def setup(self, model): + pass + + def teardown(self): + # Ensure if necessary all processes are finished + self.barrier() + + def barrier(self, name: Optional[str] = None): + pass + + def broadcast(self, obj, src=0): + return obj + + def train_or_test(self): + if self.trainer.testing: + results = self.trainer.run_test() + else: + results = self.trainer.train() + return results + + def batch_to_device(self, batch: Any, device: torch.device): + model = self.trainer.get_model() + if model is not None: + return model.transfer_batch_to_device(batch, device) + return move_data_to_device(batch, device) + + def training_step_end(self, output): + return output + + def test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + + def process_dataloader(self, dataloader): + return dataloader + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + if self.trainer.precision == 16: + closure_loss = self.trainer.precision_connector.backend.backward( + closure_loss, optimizer, opt_idx, *args, **kwargs + ) + else: + # do backward pass + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): + model_ref = self.trainer.get_model() + is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) + native_amp = self.trainer.amp_backend == AMPType.NATIVE + + # native amp + lbfgs is a no go right now + if native_amp and is_lbfgs: + raise MisconfigurationException( + "native PyTorch amp and lbfgs are not compatible." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + # model hook + model_ref.optimizer_step( + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True + using_native_amp=native_amp, + using_lbfgs=is_lbfgs, + ) + + # scale when native amp + if native_amp: + self.trainer.scaler.update() + + def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): + model_ref = self.trainer.get_model() + model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + + def clip_gradients(self, optimizer, clip_val=None): + # TODO: separate TPU case from here + self._clip_gradients(optimizer, clip_val) + + def _clip_gradients(self, optimizer, clip_val=None): + # use the trainer's clip val if none passed + grad_clip_val = self.trainer.gradient_clip_val + if clip_val is not None: + grad_clip_val = clip_val + grad_clip_val = float(grad_clip_val) + + # this code is a modification of torch.nn.utils.clip_grad_norm_ + # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md + if grad_clip_val <= 0: + return + + model = self.trainer.get_model() + if self.trainer.amp_backend == AMPType.APEX: + parameters = amp.master_params(optimizer) + else: + parameters = model.parameters() + + max_norm = grad_clip_val + norm_type = float(2.0) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + device = parameters[0].device + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + + def on_train_epoch_end(self, outputs): + pass + + def on_train_end(self): + pass + + def early_stopping_should_stop(self, pl_module): + return self.trainer.should_stop + + def setup_optimizers(self, model): + if self.trainer.testing is True: + return + + optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + self.trainer.optimizers = optimizers + self.trainer.lr_schedulers = lr_schedulers + self.trainer.optimizer_frequencies = optimizer_frequencies + + def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def sync_tensor( + self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed 
processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + raise NotImplementedError() + + def __getstate__(self): + return { + "trainer": self.trainer, + "nickname": self.nickname, + "cluster_environment": self.cluster_environment, + "dist": self.dist, + "ddp_plugin": self.ddp_plugin, + } + + def __setstate__(self, d): + self.trainer = d["trainer"] + self.nickname = d["nickname"] + self.cluster_environment = d["cluster_environment"] + self.dist = d["dist"] + self.ddp_plugin = d["ddp_plugin"] + + +# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos... +class BackendType(Enum): + DP = "dp" + DDP = "ddp" + DDP2 = "ddp2" + DDP_SPAWN = "ddp_spawn" + # decuple distrib and device + DDP_CPU = "ddp_cpu" + HOROVOD = "horovod" + # this is rather device + TPU = "tpu" diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/old/accelerator_connector.py similarity index 100% rename from pytorch_lightning/accelerators/accelerator_connector.py rename to pytorch_lightning/accelerators/old/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/old/cpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/cpu_accelerator.py rename to pytorch_lightning/accelerators/old/cpu_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/old/ddp2_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp2_accelerator.py rename to pytorch_lightning/accelerators/old/ddp2_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/old/ddp_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_hpc_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_spawn_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/old/dp_accelerator.py similarity index 100% rename from 
pytorch_lightning/accelerators/dp_accelerator.py rename to pytorch_lightning/accelerators/old/dp_accelerator.py diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/old/gpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/gpu_accelerator.py rename to pytorch_lightning/accelerators/old/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/old/horovod_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/horovod_accelerator.py rename to pytorch_lightning/accelerators/old/horovod_accelerator.py diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/old/tpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/tpu_accelerator.py rename to pytorch_lightning/accelerators/old/tpu_accelerator.py From f9c1e8d557d02ffd5dd1c774e8403d1a743a798c Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:18 +0100 Subject: [PATCH 002/157] add initial draft of new accelerators --- pytorch_lightning/accelerators/accelerator.py | 333 ++++++++---------- 1 file changed, 141 insertions(+), 192 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 1b3ae6f23058a..3d1b5038dcc20 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,79 +1,69 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
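Patch 002 below replaces the trainer-bound Accelerator with a first draft of NewAccelerator: it owns its optimizers, takes an explicit root device, and delegates mixed-precision handling to a precision plugin. A hedged construction sketch based only on the signature introduced in the diff that follows; the LightningModule instance and batch are placeholders, and several hooks in the draft (e.g. setup_optimizers) still read a self.trainer attribute that has to be attached elsewhere:

    import torch
    from pytorch_lightning.accelerators.accelerator import NewAccelerator
    from pytorch_lightning.accelerators.precision import PrecisionPlugin

    accelerator = NewAccelerator(
        model_ref=lightning_module,          # placeholder LightningModule
        root_device=torch.device("cuda", 0),
        precision_plugin=PrecisionPlugin(),  # plain 32-bit plugin
        gradient_clip_val=0.0,
    )
    accelerator.setup(lightning_module)      # sets up optimizers, connects the precision plugin
    batch = accelerator.to_device(batch)     # routes through batch_to_device / transfer_batch_to_device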
-from contextlib import contextmanager -from typing import Any, Optional, Union +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities import AMPType +from typing import Any, Union +import math import torch from torch.optim import Optimizer -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.parsing import AttributeDict +from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin, PrecisionPlugin -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - class ReduceOp: - SUM = None +from pytorch_lightning.utilities.apply_func import move_data_to_device -class Accelerator(object): +class NewAccelerator(object): + root_device: Union[str, torch.device] - def __init__(self, - trainer: Optional = None, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - self.trainer = trainer - self.nickname = None - self.cluster_environment = cluster_environment - self.dist = AttributeDict(rank=0, device=None) - self.ddp_plugin = ddp_plugin + def __init__( + self, + model_ref: LightningModule, + root_device: Union[str, torch.device], + precision_plugin: PrecisionPlugin, + gradient_clip_val, + ): + self.model_ref = model_ref + self.precision_plugin = precision_plugin + self.gradient_clip_val = gradient_clip_val - if trainer is not None: - self.train_loop = self.trainer.train - self.validation_loop = self.trainer.run_evaluation - self.test_loop = self.trainer.run_evaluation + self.optimizers = None + self.lr_schedulers = None + self.optimizer_frequencies = None + self.root_device = root_device def setup(self, model): - pass + self.setup_optimizers(model) + self.connect_precision_plugin() def teardown(self): - # Ensure if necessary all processes are finished - self.barrier() - - def barrier(self, name: Optional[str] = None): pass - def broadcast(self, obj, src=0): - return obj - - def train_or_test(self): - if self.trainer.testing: - results = self.trainer.run_test() - else: - results = self.trainer.train() - return results - def batch_to_device(self, batch: Any, device: torch.device): - model = self.trainer.get_model() + model = self.model_ref if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) + def training_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + + return self.model_ref.training_step(*args) + + def validation_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + + return self.model_ref.validation_step(*args) + + def test_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + return self.model_ref.test_step(*args) + def training_step_end(self, output): return output @@ -87,28 +77,36 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - automatic_optimization = self.trainer.train_loop.automatic_optimization - - if not automatic_optimization and self.ddp_plugin is not None: - # Manually prepare for reduce as user calling backwards manually - 
self.ddp_plugin.on_before_manual_backward(self.trainer.model, closure_loss) + return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + model_ref = self.model_ref + is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) + native_amp = self.trainer.amp_backend == AMPType.NATIVE + + self.precision_plugin.pre_optimizer_step(optimizer) + + # model hook + model_ref.optimizer_step( + epoch=current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True + using_native_amp=native_amp, + using_lbfgs=is_lbfgs, + ) - if self.trainer.precision == 16: - closure_loss = self.trainer.precision_connector.backend.backward( - closure_loss, optimizer, opt_idx, *args, **kwargs - ) - else: - # do backward pass - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + self.precision_plugin.post_optimizer_step() - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss + def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): + model_ref = self.model_ref + model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): # use the trainer's clip val if none passed - grad_clip_val = self.trainer.gradient_clip_val + grad_clip_val = self.gradient_clip_val if clip_val is not None: grad_clip_val = clip_val grad_clip_val = float(grad_clip_val) @@ -117,12 +115,37 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): - if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + model = self.model_ref + + # TODO: Change this. 
Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + if self.trainer.amp_backend == AMPType.APEX: + parameters = self.precision_plugin.master_params(optimizer) else: - model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + parameters = model.parameters() + + max_norm = grad_clip_val + norm_type = float(2.0) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = self.precision_plugin.EPSILON + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) def on_train_epoch_end(self, outputs): pass @@ -130,126 +153,52 @@ def on_train_epoch_end(self, outputs): def on_train_end(self): pass + # TODO: Check if we can change logic for early stopping to accelerator/trainer completely or have a separate connector (should be self contained) def early_stopping_should_stop(self, pl_module): return self.trainer.should_stop def setup_optimizers(self, model): - if self.trainer.testing: + # TODO: Check if we can change logic for early stopping to trainer completely (should be self contained) + if self.trainer.testing is True: return optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - def init_ddp_connection( - self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True - ) -> None: - self.ddp_plugin.init_ddp_connection( - self.trainer, - self.cluster_environment, - global_rank, - world_size, - is_slurm_managing_tasks, + self.optimizers = optimizers + self.lr_schedulers = lr_schedulers + self.optimizer_frequencies = optimizer_frequencies + + def connect_precision_plugin(self): + model, optimizers, schedulers = self.precision_plugin.connect( + self.model_ref, self.optimizers, self.lr_schedulers ) - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - """ - Function to reduce a tensor from several distributed processes to one aggregated tensor. - - Args: - tensor: the tensor to sync and reduce - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. - - Return: - reduced value - """ - raise NotImplementedError() - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. 
Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - raise NotImplementedError() - - def optimizer_state(self, optimizer: Optimizer) -> dict: - """ - Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom - plugins. - Return: - Optimizer state dict - """ - if self.ddp_plugin: - return self.ddp_plugin.optimizer_state(optimizer) - return optimizer.state_dict() - - def get_reference_model(self, model) -> LightningModule: - """ - Override to modify returning base :class:`LightningModule` - when accessing variable and functions if the accelerator has wrapped the model. - - Example:: - ref_model = accelerator.get_reference_model(model) - ref_model.training_step(...) - - Args: - model: Accelerator model. - - Returns: Reference :class:`LightningModule`. - - """ - return model - - def __getstate__(self): - return { - 'trainer': self.trainer, - 'nickname': self.nickname, - 'cluster_environment': self.cluster_environment, - 'dist': self.dist, - 'ddp_plugin': self.ddp_plugin - } - - def __setstate__(self, d): - self.trainer = d['trainer'] - self.nickname = d['nickname'] - self.cluster_environment = d['cluster_environment'] - self.dist = d['dist'] - self.ddp_plugin = d['ddp_plugin'] - - def on_save(self, checkpoint): - return checkpoint - - @property - def rpc_enabled(self): - return self.ddp_plugin is not None and isinstance(self.ddp_plugin, RPCPlugin) - - @property - def distributed_sampler_kwargs(self): - raise NotImplementedError - - @property - def require_distributed_sampler(self): - raise NotImplementedError - - @contextmanager - def block_ddp_plugin_sync_behaviour(self): - """ - Blocks ddp sync gradients behaviour on backwards pass. - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - cm = self.ddp_plugin.block_backward_sync(self.trainer.model) if self.ddp_plugin else None - yield cm + self.model_ref = model + self.optimizers = optimizers + self.schedulers = schedulers + + def to_device(self, batch): + return self.batch_to_device(batch, self.root_device) + + +class NewCPUAccelerator(NewAccelerator): + def setup(self, model): + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") + + if "cpu" not in str(self.root_device): + raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") + + return super().setup(model) + + +class NewGPUAccelerator(NewAccelerator): + def setup(self, model): + if "cuda" not in str(self.root_device): + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + torch.cuda.set_device(self.root_device) + self.model_ref.to(self.root_device) + + return super().setup(model) + + +# TODO: Add NewTPUAccelerator From 28ae4037ead0723f006e4cef2d6e30fb45dacf25 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:30 +0100 Subject: [PATCH 003/157] add initial data parallel draft --- .../accelerators/data_parallel.py | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 pytorch_lightning/accelerators/data_parallel.py diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py new file mode 100644 index 0000000000000..9a6481c65c5db --- /dev/null +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -0,0 +1,325 @@ +from abc import ABC, abstractmethod + +from torch.nn.parallel.distributed import DistributedDataParallel +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.utilities.seed import seed_everything +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.distributed.dist import LightningDistributed +import torch +import os +from pytorch_lightning.core.step_result import Result +from typing import Any, Dict, List, Optional, Union +from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel +from torch.nn.parallel.data_parallel import DataParallel +import sys +from os.path import abspath +from time import sleep +import subprocess +from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only +import numpy as np +import torch.distributed as torch_distrib +from pytorch_lightning import _logger as log + +from pytorch_lightning.utilities.distributed import sync_ddp_if_available + +try: + from hydra.utils import to_absolute_path, get_original_cwd + from hydra.core.hydra_config import HydraConfig +except ImportError: + HYDRA_AVAILABLE = False +else: + HYDRA_AVAILABLE = True + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class ParallelPlugin(ABC): + def __init__(self): + self.model = None + + @abstractmethod + def reduce(self, output): + raise NotImplementedError + + @abstractmethod + @property + def root_device(self): + raise NotImplementedError + + +class DataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids): + super().__init__() + self.parallel_device_ids = parallel_device_ids + + def setup(self, model): + self.model = LightningDataParallel(model, self.parallel_device_ids) + + def reduce(self, output): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_device_ids[0] + + +class DistributedDataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): + super().__init__(self) + + self.task_idx = None + self._has_spawned_children = False + self.interactive_ddp_procs = [] + self.dist = LightningDistributed() + 
self.parallel_device_ids = parallel_device_ids + self.num_nodes = num_nodes + self.num_processes = num_processes + self._ddp_kwargs: Dict[str, Any] = ddp_kwargs + + def setup(self, model): + # start the other scripts + if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = int(os.environ["LOCAL_RANK"]) + + def _call_children_scripts(self): + assert self.trainer.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.trainer.data_parallel_device_ids is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + # TODO: Change t + if self.trainer.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.trainer.logger.version) + + num_gpus = len(self.parallel_device_ids) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + # TODO: Refactor This! Not sure we still need the whole method here. 
Should be dione with some additional setup and cleaning logic + def ddp_train(self, process_idx, model): + """ + Entry point for ddp + + Args: + process_idx: + mp_queue: multiprocessing queue + model: + + Returns: + Dict with evaluation results + + """ + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model, process_idx) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + # device ids change depending on the DDP setup + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.barrier('ddp_setup') + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # clean up memory + torch.cuda.empty_cache() + + return results + + def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> LightningDistributedDataParallel: + """ + Pass through all customizations from constructor to `LightningDistributedDataParallel`. + Override to define a custom DDP implementation. + + .. 
note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel + + + The default implementation is:: + + def configure_ddp(self, model, device_ids): + model = LightningDistributedDataParallel( + model, device_ids=device_ids, find_unused_parameters=True + ) + return model + + Args: + model: the lightningModule + device_ids: the list of devices available + + Returns: + the model wrapped in LightningDistributedDataParallel + + """ + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( + "find_unused_parameters", True + ) + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + **self._ddp_kwargs, + ) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + + """ + return sync_ddp_if_available(tensor, group, reduce_op) From fe7573f812d8783a3d9ea91658687f174e56ef38 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:39 +0100 Subject: [PATCH 004/157] add initial precision draft --- pytorch_lightning/accelerators/precision.py | 150 ++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 pytorch_lightning/accelerators/precision.py diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py new file mode 100644 index 0000000000000..19a375272e95f --- /dev/null +++ b/pytorch_lightning/accelerators/precision.py @@ -0,0 +1,150 @@ +from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties +from pytorch_lightning.core.lightning import LightningModule +from typing import List, Tuple +import torch +from torch.optim import Optimizer + +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities import AMPType, rank_zero_warn + +try: + from apex import amp +except ImportError: + amp = None + + +class PrecisionPlugin(object): + EPSILON = 1e-6 + precision = 32 + + def pre_optimizer_step(self, optimizer, optiizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def master_params(self, optimizer): + for group in optimizer.param_groups: + for p in group["params"]: + yield p + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + +class MixedPrecisionPlugin(PrecisionPlugin): + EPSILON = 1e-5 + backend: AMPType + precision = "mixed" + + +class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.NATIVE + self.scaler = torch.cuda.amp.GradScaler() + + def pre_optimizer_step(self, optimizer, optimizer_idx): + if isinstance(optimizer, torch.optim.LBFGS): + raise MisconfigurationException( + f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + def post_optimizer_step(self, optimizer, optimizer_idx): + self.scaler.update() + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + closure_loss = self.scaler.scale(closure_loss) + + # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) + automatic_optimization = self.trainer.train_loop.automatic_optimization + + # do backward pass + if automatic_optimization: + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + # unscale gradient to allow analyze within `on_after_backward` + # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) + if not self.trainer.train_loop.should_accumulate() and automatic_optimization: + self.scaler.unscale_(optimizer) + + return closure_loss + + +class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.APEX + + def connect(self, model, optimizers, lr_schedulers): + model, optimizers = self.configure_apex(amp, model, optimizers, self.trainer.amp_level) + reinit_scheduler_properties(optimizers, lr_schedulers) + return model, optimizers, lr_schedulers + + def training_step(self, fx, args): + output = fx(args) + return output + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + closure_loss = amp.scale_loss(closure_loss, optimizer) + + # enter apex context + context = closure_loss + closure_loss = closure_loss.__enter__() + + # do backward pass + if self.trainer.train_loop.automatic_optimization: + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # exit amp context + a, b, c = None, None, None + error = context.__exit__(a, b, c) + if error: + rank_zero_warn(a, b, c) + raise Exception("apex unscale error") + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def configure_apex( + self, + amp: object, + model: LightningModule, + optimizers: List[Optimizer], + amp_level: str, + ) -> Tuple[LightningModule, List[Optimizer]]: + r""" + Override to init AMP your own way. + Must return a model and list of optimizers. + + Args: + amp: pointer to amp library object. + model: pointer to current :class:`LightningModule`. + optimizers: list of optimizers passed in :meth:`configure_optimizers`. + amp_level: AMP mode chosen ('O1', 'O2', etc...) + + Return: + Apex wrapped model and optimizers + + Examples: + .. code-block:: python + + # Default implementation used by Trainer. 
+ def configure_apex(self, amp, model, optimizers, amp_level): + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + """ + model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) + return model, optimizers \ No newline at end of file From 9fd48a1cdf7d9946b74e6d4b6e04c75a2d52869d Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:48 +0100 Subject: [PATCH 005/157] scheduler helper functions --- .../accelerators/scheduler_properties.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 pytorch_lightning/accelerators/scheduler_properties.py diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py new file mode 100644 index 0000000000000..6835df4499385 --- /dev/null +++ b/pytorch_lightning/accelerators/scheduler_properties.py @@ -0,0 +1,25 @@ +from torch import optim + + +def reinit_scheduler_properties(self, optimizers: list, schedulers: list): + # Reinitialize optimizer.step properties added by schedulers + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + state = None + idx = 0 + + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) \ No newline at end of file From b961aaf054bda242a361cba30d31ae776588b029 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:23 +0100 Subject: [PATCH 006/157] define base plugin api --- pytorch_lightning/accelerators/base_plugin.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 pytorch_lightning/accelerators/base_plugin.py diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py new file mode 100644 index 0000000000000..acd90e41f60df --- /dev/null +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -0,0 +1,31 @@ +import contextlib +import torch + +class Plugin(object): + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + def pre_optimizer_step(self, optimizer, optiizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def pre_training(self): + pass + + def post_training(self): + pass + + @contextlib.contextmanager + def train_step_context(self): + yield + + @contextlib.contextmanager + def val_step_context(self): + yield + + @contextlib.contextmanager + def test_step_context(self): + yield \ No newline at end of file From 532ad5dcaeb6599629b4e33aa87b30292b8508f0 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:32 +0100 Subject: [PATCH 007/157] base plugin integration --- pytorch_lightning/accelerators/accelerator.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3d1b5038dcc20..ccfc093fde5a5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,3 +1,5 @@ +from 
pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import AMPType from typing import Any, Union @@ -20,10 +22,12 @@ def __init__( model_ref: LightningModule, root_device: Union[str, torch.device], precision_plugin: PrecisionPlugin, + parallel_plugin: ParallelPlugin, gradient_clip_val, ): self.model_ref = model_ref self.precision_plugin = precision_plugin + self.parallel_plugin = parallel_plugin self.gradient_clip_val = gradient_clip_val self.optimizers = None @@ -33,7 +37,8 @@ def __init__( def setup(self, model): self.setup_optimizers(model) - self.connect_precision_plugin() + self.connect_plugin(self.precision_plugin) + self.connect_plugin(self.parallel_plugin) def teardown(self): pass @@ -49,29 +54,27 @@ def training_step(self, args): args[0] = batch - return self.model_ref.training_step(*args) + with self.precision_plugin.train_step_context(): + with self.parallel_plugin.train_step_context(): + return self.model_ref.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) args[0] = batch - return self.model_ref.validation_step(*args) + with self.precision_plugin.val_step_context(): + with self.parallel_plugin.val_step_context(): + return self.model_ref.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) args[0] = batch - return self.model_ref.test_step(*args) - def training_step_end(self, output): - return output - - def test_step_end(self, output): - return output - - def validation_step_end(self, output): - return output + with self.precision_plugin.test_step_context(): + with self.parallel_plugin.test_step_context(): + return self.model_ref.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -167,8 +170,8 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_precision_plugin(self): - model, optimizers, schedulers = self.precision_plugin.connect( + def connect_plugin(self, plugin: Plugin): + model, optimizers, schedulers = plugin.connect( self.model_ref, self.optimizers, self.lr_schedulers ) @@ -176,6 +179,7 @@ def connect_precision_plugin(self): self.optimizers = optimizers self.schedulers = schedulers + def to_device(self, batch): return self.batch_to_device(batch, self.root_device) From f52ad64e5c233aabb664d80bc899bacc1dacfcce Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:50 +0100 Subject: [PATCH 008/157] continue ddp plugin --- .../accelerators/data_parallel.py | 379 ++++++++++++++++-- 1 file changed, 344 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 9a6481c65c5db..e506041384ad3 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,4 +1,7 @@ from abc import ABC, abstractmethod +from contextlib import contextmanager +from os import stat +from pytorch_lightning.accelerators.base_plugin import Plugin from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule @@ -19,6 +22,8 @@ import numpy as np import torch.distributed as torch_distrib from pytorch_lightning import _logger as log +import contextlib +import torch.multiprocessing as mp from 
pytorch_lightning.utilities.distributed import sync_ddp_if_available @@ -38,12 +43,15 @@ class ReduceOp: SUM = None -class ParallelPlugin(ABC): - def __init__(self): +class TrainingTypePlugin(Plugin, ABC): + def __init__(self, logger=None): self.model = None + self.global_rank = 0 + self.logger = logger @abstractmethod - def reduce(self, output): + @property + def on_gpu(self): raise NotImplementedError @abstractmethod @@ -51,12 +59,86 @@ def reduce(self, output): def root_device(self): raise NotImplementedError + @abstractmethod + def model_to_device(self): + raise NotImplementedError -class DataParallelPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids): - super().__init__() + @abstractmethod + @property + def is_global_zero(self): + raise NotImplementedError + + @abstractmethod + def barrier(self): + raise NotImplementedError + +class SingleDevicePlugin(TrainingTypePlugin): + def __init__(self, device, logger=None): + super().__init__(logger=logger) + self.device: torch.device = device + + @property + def on_gpu(self): + return self.device.type == "cuda" and torch.cuda.is_available() + + def reduce(self, output): + return output + + @property + def root_device(self): + return self.device + + def model_to_device(self): + self.model.to(self.root_device) + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + self.model = model + + @property + def is_global_zero(self): + return True + + def barrier(self): + pass + + + +class ParallelPlugin(TrainingTypePlugin, ABC): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + super().__init__(logger=logger) self.parallel_device_ids = parallel_device_ids + self.local_rank = 0 + self.world_size = 1 + self.cluster_environment = cluster_environment + @abstractmethod + def reduce(self, output): + raise NotImplementedError + + @abstractmethod + @property + def root_device(self): + raise NotImplementedError + + @property + def on_gpu(self): + return self.parallel_device_ids and torch.cuda.is_available() + + @abstractmethod + def setup(self, model): + raise NotImplementedError + + def connect(self, model): + self.setup(model) + + return self.model + + @property + def is_global_zero(self) -> bool: + return self.global_rank == 0 + + +class DataParallelPlugin(ParallelPlugin): def setup(self, model): self.model = LightningDataParallel(model, self.parallel_device_ids) @@ -73,16 +155,252 @@ def reduce(self, output): def root_device(self): return self.parallel_device_ids[0] + def barrier(self): + pass + + +class DDPPlugin(ParallelPlugin): + + distributed_backend = "ddp" + + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) -> None: + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self._has_spawned_children = False + self.interactive_ddp_procs = [] + self.dist = LightningDistributed() + + @property + def root_device(self): + return self.parallel_device_ids[self.local_rank] + + def setup(self, model): + + self.model = model + + # start the other scripts + if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = int(os.environ["LOCAL_RANK"]) + + def _call_children_scripts(self): + + # bookkeeping of spawned processes + assert self.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + # DDP Environment variables + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + 
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.parallel_device_ids is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + if self.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.logger.version) + + num_gpus = len(self.data_parallel_device_ids) + # TODO: Add num_nodes (pass it in?) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + + # TODO: Add num_processes (pass it in?) + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def _check_can_spawn_children(self): + if self._has_spawned_children: + raise RuntimeError( + "You tried to run `.fit` or `.test` multiple times in the same script." + " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." + ) + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.global_rank = self.node_rank * self.num_processes + self.task_idx + self.world_size = self.num_nodes * self.num_processes -class DistributedDataParallelPlugin(ParallelPlugin): + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + return [self.root_device] + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: From where to get cluster environment? 
+ os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def pre_training(self): + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # show progressbar only on progress_rank 0 + # TODO: check where to move this. Cannot stay here, since we won't have access to progressbar here + if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # TODO: This has to be done somewhere else! + self.model.trainer = self.trainer + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + def post_training(self): + torch.cuda.empty_cache() + + if "WORLD_SIZE" in os.environ: + del os.environ["WORLD_SIZE"] + + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def barrier(self): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def model_to_device(self): + # TODO: Can we easily make this a property that falls back here? 
+ # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] + torch.cuda.set_device(self.root_device) + self.model.cuda(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None): + + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + + return output + + + + + + + + + + + + + + + + + + + +class MidDistributedDataParallelPlugin(ParallelPlugin): def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): - super().__init__(self) + super().__init__(parallel_device_ids) self.task_idx = None self._has_spawned_children = False self.interactive_ddp_procs = [] self.dist = LightningDistributed() - self.parallel_device_ids = parallel_device_ids self.num_nodes = num_nodes self.num_processes = num_processes self._ddp_kwargs: Dict[str, Any] = ddp_kwargs @@ -128,15 +446,14 @@ def _call_children_scripts(self): # when the trainer script was called the device has already been scoped by the time # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone # but forward the GPUs selected via environment variables - if self.trainer.data_parallel_device_ids is None: + if self.parallel_device_ids is None: raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - # TODO: Change t - if self.trainer.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.trainer.logger.version) + if self.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.logger.version) num_gpus = len(self.parallel_device_ids) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" @@ -186,6 +503,7 @@ def ddp_train(self, process_idx, model): if seed is not None: seed_everything(int(seed)) + # TODO: move this somewhere else! # show progressbar only on progress_rank 0 if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() @@ -201,9 +519,7 @@ def ddp_train(self, process_idx, model): # where to store ip_table model.trainer = self.trainer self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks + self.trainer.global_rank, self.trainer.world_size, self.trainer.is_slurm_managing_tasks ) # call setup after the ddp process has connected @@ -211,10 +527,10 @@ def ddp_train(self, process_idx, model): # on world_size=0 let everyone know training is starting if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) + log.info("-" * 100) + log.info(f"distributed_backend={self.trainer.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes") + log.info("-" * 100) # call sync_bn before .cuda(), configure_apex and configure_ddp if self.trainer.sync_batchnorm: @@ -240,7 +556,7 @@ def ddp_train(self, process_idx, model): model = self.configure_ddp(model, device_ids) # set up training routine - self.barrier('ddp_setup') + self.barrier("ddp_setup") self.trainer.train_loop.setup_training(model) # train or test @@ -255,15 +571,13 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + torch_backend = "nccl" if self.on_gpu else "gloo" if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> LightningDistributedDataParallel: + def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: """ Pass through all customizations from constructor to `LightningDistributedDataParallel`. Override to define a custom DDP implementation. @@ -288,9 +602,7 @@ def configure_ddp(self, model, device_ids): """ # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( - "find_unused_parameters", True - ) + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) model = LightningDistributedDataParallel( model, device_ids=device_ids, @@ -315,11 +627,8 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: return model - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - """ - - """ + def sync_tensor( + self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> torch.Tensor: + """""" return sync_ddp_if_available(tensor, group, reduce_op) From bcfb4e7cb723ddc3e1dbdce14bff086a4e95d0de Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:59:06 +0100 Subject: [PATCH 009/157] minor changes precision plugin --- pytorch_lightning/accelerators/precision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 19a375272e95f..0b53e3addbbd7 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -1,3 +1,4 @@ +from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties from pytorch_lightning.core.lightning import LightningModule from typing import List, Tuple @@ -13,7 +14,7 @@ amp = None -class PrecisionPlugin(object): +class PrecisionPlugin(Plugin): EPSILON = 1e-6 precision = 32 From bf8a87a659d5b8218bba872e188caedf2c013a21 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:59:30 +0100 Subject: [PATCH 010/157] start ddp plugin --- .../accelerators/data_parallel.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git 
a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index e506041384ad3..50c27a1722ac4 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -375,6 +375,25 @@ def reduce(self, output, group: Optional[Any] = None, return output +class DDPSpawnPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + + self.dist = LightningDistributed() + # TODO: how to get in nprocs? probably pass it + self.nprocs = nprocs + self.mp_queue = None + + def setup(self, model): + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() + + def pre_training(self, process_idx = None, mp_queue=None, ): + # TODO: use a mixture of os.fork and multiprocesing queue for ddp here + os.fork() From 8482c0b68976817ce3562bcb52fc49da673548f6 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:26:38 +0100 Subject: [PATCH 011/157] initail version ddp spawn --- pytorch_lightning/accelerators/base_plugin.py | 4 +- .../accelerators/data_parallel.py | 151 +++++++++++++++++- 2 files changed, 146 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index acd90e41f60df..1fdae7270fe47 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -5,7 +5,7 @@ class Plugin(object): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers - + def pre_optimizer_step(self, optimizer, optiizer_idx): pass @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self): + def post_training(self, results, best_model_path): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 50c27a1722ac4..0ef2987804450 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,6 +1,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from os import stat +import re +from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin from torch.nn.parallel.distributed import DistributedDataParallel @@ -24,8 +26,7 @@ from pytorch_lightning import _logger as log import contextlib import torch.multiprocessing as mp - -from pytorch_lightning.utilities.distributed import sync_ddp_if_available +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn try: from hydra.utils import to_absolute_path, get_original_cwd @@ -267,6 +268,7 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx + # TODO: check from where we get node_rank and num_processes self.global_rank = self.node_rank * self.num_processes + self.task_idx self.world_size = self.num_nodes * self.num_processes @@ -315,8 +317,12 @@ def pre_training(self): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks 
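The `is_slurm_managing_tasks` flag passed to `init_ddp_connection` below is referenced by the TODO above but never defined in these patches. A rough sketch of how it could be derived from the SLURM environment, assuming the standard SLURM_NTASKS / SLURM_JOB_NAME variables (illustration only, not part of this patch):

    import os

    def is_slurm_managing_tasks(num_nodes: int, num_processes: int) -> bool:
        # SLURM is treated as managing the DDP processes when srun launched
        # exactly as many tasks as DDP expects and the job is not an
        # interactive shell session
        num_slurm_tasks = int(os.environ.get("SLURM_NTASKS", 0))
        job_name = os.environ.get("SLURM_JOB_NAME")
        return num_slurm_tasks == num_nodes * num_processes and job_name != "bash"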
self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + # TODO: Move this somewhere else + self.trainer.call_setup_hook(self.model) + # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): log.info("-" * 100) @@ -329,11 +335,15 @@ def pre_training(self): # move the model to the correct device self.model_to_device() + # TODO: Check where this can be moved + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(self.model) + self.configure_ddp() self.barrier() - def post_training(self): + def post_training(self, results, best_model_path): torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: @@ -375,14 +385,17 @@ def reduce(self, output, group: Optional[Any] = None, return output + class DDPSpawnPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self.process_idx = None self.dist = LightningDistributed() # TODO: how to get in nprocs? probably pass it self.nprocs = nprocs self.mp_queue = None + self.proc_offset = proc_offset def setup(self, model): os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) @@ -391,13 +404,137 @@ def setup(self, model): smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def pre_training(self, process_idx = None, mp_queue=None, ): - # TODO: use a mixture of os.fork and multiprocesing queue for ddp here - os.fork() + def set_world_ranks(self): + self.local_rank = self.process_idx + # check from where we get node_rank, num_processes and num_nodes + self.global_rank = self.node_rank * self.num_processes + self.self.process_idx + self.world_size = self.num_nodes * self.num_processes + + def pre_training(self): + + # TODO: Check if current process can be used as one training proc + # start from one since current process is proc 0 + for proc_idx in range(1, self.nprocs): + # use os.fork, since this enables us to continue from here + # instead of spawning with separate function + pid = os.fork() + + # set in child processes (PID=0). All previous child processes + # should already have their process_idx assigned + if pid == 0 and self.process_idx is None: + self.process_idx = proc_idx + self.proc_offset + + # set process idx for current process + if pid != 0: + self.process_idx = 0 + self.proc_offset + + # TODO: Check where to put that since we don't have access to the pbar here + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + self.set_world_ranks() + # set warning rank + rank_zero_only.rank = self.global_rank + # TODO: This has to be done somewhere else! 
+ self.model.trainer = self.trainer + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks + self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + + # TODO: Move this somewhere else + self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + # TODO: Check where this can be moved + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(self.model) + + self.configure_ddp() + + self.barrier() + + def post_training(self, results, best_model_path): + # get original model + # TODO: How To get this? is this simply self.model? + model = self.trainer.get_model() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + # clean up memory + torch.cuda.empty_cache() + + if self.process_idx == 0: + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + return [self.root_device] + def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(results) + + # save the last weights + last_path = None + # TODO: From where to get self.trainer.testing? + if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) + atomic_save(self.model.state_dict(), last_path) + self.mp_queue.put(last_path) + + + def __recover_child_process_weights(self, model, best_path, last_path): + # TODO: Where can we set this? + # transfer back the best path to the trainer + if self.trainer.checkpoint_callback: + self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + # TODO: How to get self.trainer.testing? + if last_path is not None and not self.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + # TODO: Where to set this? + # Do we really need to set this or can we just make the trainer property forward our current property here? 
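One possible answer to the question in the comment above is to leave the attribute on the plugin and have the trainer forward to it, since the training type plugin now owns the (possibly wrapped) model. A minimal sketch, assuming the trainer keeps a reference to its accelerator (not part of this patch):

    class Trainer:
        # only the forwarding property is sketched here
        @property
        def model(self):
            # the training type plugin owns the model and any DDP/DP wrapper
            return self.accelerator.training_type_plugin.model

        @model.setter
        def model(self, model):
            self.accelerator.training_type_plugin.model = model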
+ self.trainer.model = model From 12d2c59dc3e5110ed5caf840aa3200550ab70724 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:27:31 +0100 Subject: [PATCH 012/157] remove deprecated implementation --- .../accelerators/data_parallel.py | 252 ------------------ 1 file changed, 252 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 0ef2987804450..fc5c2958f1af1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -536,255 +536,3 @@ def __recover_child_process_weights(self, model, best_path, last_path): # Do we really need to set this or can we just make the trainer property forward our current property here? self.trainer.model = model - - - - - - - - - - - - - -class MidDistributedDataParallelPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): - super().__init__(parallel_device_ids) - - self.task_idx = None - self._has_spawned_children = False - self.interactive_ddp_procs = [] - self.dist = LightningDistributed() - self.num_nodes = num_nodes - self.num_processes = num_processes - self._ddp_kwargs: Dict[str, Any] = ddp_kwargs - - def setup(self, model): - # start the other scripts - if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": - self._call_children_scripts() - - # set the task idx - self.task_idx = int(os.environ["LOCAL_RANK"]) - - def _call_children_scripts(self): - assert self.trainer.global_rank == 0 - self._check_can_spawn_children() - self._has_spawned_children = True - - os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") - os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) - - # allow the user to pass the node rank - node_rank = "0" - node_rank = os.environ.get("NODE_RANK", node_rank) - node_rank = os.environ.get("GROUP_RANK", node_rank) - os.environ["NODE_RANK"] = node_rank - os.environ["LOCAL_RANK"] = "0" - - # when user is using hydra find the absolute path - path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception as e: - full_path = abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command - - # the visible devices tell us how many GPUs we want to use. - # when the trainer script was called the device has already been scoped by the time - # code reaches this point. 
so, to call the scripts, we need to leave cuda visible devices alone - # but forward the GPUs selected via environment variables - if self.parallel_device_ids is None: - raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - - os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) - os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - - if self.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.logger.version) - - num_gpus = len(self.parallel_device_ids) - os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" - - self.interactive_ddp_procs = [] - for local_rank in range(1, self.num_processes): - env_copy = os.environ.copy() - env_copy["LOCAL_RANK"] = f"{local_rank}" - - # remove env var if global seed not set - if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: - del env_copy["PL_GLOBAL_SEED"] - - # start process - # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None - if HYDRA_AVAILABLE: - if HydraConfig.initialized(): - cwd = get_original_cwd() - proc = subprocess.Popen(command, env=env_copy, cwd=cwd) - self.interactive_ddp_procs.append(proc) - - # starting all processes at once can cause issues - # with dataloaders delay between 1-10 seconds - delay = np.random.uniform(1, 5, 1)[0] - sleep(delay) - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - # TODO: Refactor This! Not sure we still need the whole method here. Should be dione with some additional setup and cleaning logic - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - # TODO: move this somewhere else! - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, self.trainer.world_size, self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.trainer.distributed_backend}") - log.info(f"All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes") - log.info("-" * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.barrier("ddp_setup") - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: - """ - Pass through all customizations from constructor to `LightningDistributedDataParallel`. - Override to define a custom DDP implementation. - - .. note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel - - - The default implementation is:: - - def configure_ddp(self, model, device_ids): - model = LightningDistributedDataParallel( - model, device_ids=device_ids, find_unused_parameters=True - ) - return model - - Args: - model: the lightningModule - device_ids: the list of devices available - - Returns: - the model wrapped in LightningDistributedDataParallel - - """ - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - model = LightningDistributedDataParallel( - model, - device_ids=device_ids, - **self._ddp_kwargs, - ) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor( - self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None - ) -> torch.Tensor: - """""" - return sync_ddp_if_available(tensor, group, reduce_op) From 8d83db883f7316df4f5fc4a339809ac1751fa0b1 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:28:21 +0100 Subject: [PATCH 013/157] add comment on whats missing --- pytorch_lightning/accelerators/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index fc5c2958f1af1..2c7f9ae4c5924 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -536,3 +536,4 @@ def __recover_child_process_weights(self, model, best_path, last_path): # Do we really need to set this or can we just make the trainer property forward our current property here? self.trainer.model = model +# STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 22e1e31ef84e5991d536711bbb5bc7e9779375f9 Mon Sep 17 00:00:00 2001 From: justusschock Date: Fri, 20 Nov 2020 11:16:09 +0100 Subject: [PATCH 014/157] latest state --- pytorch_lightning/accelerators/accelerator.py | 27 +- .../accelerators/accelerator_connector.py | 249 ++++++++++++++++++ pytorch_lightning/accelerators/base_plugin.py | 6 +- .../accelerators/data_parallel.py | 111 ++++++-- pytorch_lightning/accelerators/precision.py | 16 +- 5 files changed, 360 insertions(+), 49 deletions(-) create mode 100644 pytorch_lightning/accelerators/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index ccfc093fde5a5..21e0f191e384e 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import AMPType @@ -15,30 +15,31 @@ class NewAccelerator(object): - root_device: Union[str, torch.device] def __init__( self, model_ref: LightningModule, - root_device: Union[str, torch.device], precision_plugin: PrecisionPlugin, - parallel_plugin: ParallelPlugin, + training_type_plugin: TrainingTypePlugin, gradient_clip_val, ): self.model_ref = model_ref self.precision_plugin = precision_plugin - self.parallel_plugin = parallel_plugin + self.training_type_plugin = training_type_plugin self.gradient_clip_val = gradient_clip_val self.optimizers = None self.lr_schedulers = None self.optimizer_frequencies = None - self.root_device = root_device def setup(self, model): + self.connect_training_type_plugin() self.setup_optimizers(model) - self.connect_plugin(self.precision_plugin) - self.connect_plugin(self.parallel_plugin) + self.connect_precision_plugin() + + @property + def root_device(self): + return self.training_type_plugin.root_device def teardown(self): pass @@ -55,7 +56,7 @@ def training_step(self, args): args[0] = batch with self.precision_plugin.train_step_context(): - with 
self.parallel_plugin.train_step_context(): + with self.training_type_plugin.train_step_context(): return self.model_ref.training_step(*args) def validation_step(self, args): @@ -64,7 +65,7 @@ def validation_step(self, args): args[0] = batch with self.precision_plugin.val_step_context(): - with self.parallel_plugin.val_step_context(): + with self.training_type_plugin.val_step_context(): return self.model_ref.validation_step(*args) def test_step(self, args): @@ -73,7 +74,7 @@ def test_step(self, args): args[0] = batch with self.precision_plugin.test_step_context(): - with self.parallel_plugin.test_step_context(): + with self.training_type_plugin.test_step_context(): return self.model_ref.test_step(*args) def process_dataloader(self, dataloader): @@ -170,9 +171,9 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_plugin(self, plugin: Plugin): + def connect_training_type_plugin(self, plugin: Plugin): model, optimizers, schedulers = plugin.connect( - self.model_ref, self.optimizers, self.lr_schedulers + self.model_ref ) self.model_ref = model diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py new file mode 100644 index 0000000000000..d9a111f355e68 --- /dev/null +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -0,0 +1,249 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning import accelerators +import os +import torch + +from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning import _logger as log +from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment +from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment +from pytorch_lightning.accelerators.accelerator import Accelerator + +try: + import torch_xla +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + +try: + import horovod.torch as hvd +except (ModuleNotFoundError, ImportError): + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + + +class BackendConnector(object): + def __init__( + self, + num_processes, + tpu_cores, + accelerator, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + log_gpu_memory, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + ): + + # initialization + self.use_dp = False + self.use_ddp = False + self.use_ddp2 = False + self.use_horovod = False + self.use_single_gpu = False + self.num_gpus = None + + self.num_processes = num_processes + self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + self.accelerator = accelerator + self.distributed_backend = distributed_backend + self.auto_select_gpus = auto_select_gpus + self.gpus = gpus + self.num_nodes = num_nodes + self.log_gpu_memory = log_gpu_memory + self.sync_batchnorm = sync_batchnorm + self.benchmark = benchmark + self.replace_sampler_ddp = replace_sampler_ddp + self.deterministic = deterministic + + # init the default rank if exists + # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks + # this way we only show it on rank 0 + if 'LOCAL_RANK' in os.environ: + rank_zero_only.rank = int(os.environ['LOCAL_RANK']) + + # TODO: Move autoselect GPUS to other place + # for gpus allow int, string and gpu list + # if auto_select_gpus and isinstance(gpus, int): + # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) + + self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) + self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) + self.root_device = torch.device("cpu") + + self.set_distributed_mode() + + # override dist backend when using tpus + if self.on_tpu: + self.distributed_backend = "tpu" + self.use_tpu = True + + # init flags for SLURM+DDP to work + self.world_size = 1 + self.interactive_ddp_procs = [] + + # link up SLURM + # TODO: this should be taken out of here... 
but depends too much on DDP + self.slurm_connector.on_trainer_init(self.num_nodes) + self.node_rank = self.determine_ddp_node_rank() + self.local_rank = self.determine_local_rank() + self.global_rank = 0 + + # NVIDIA setup + self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + + self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') + + self.replace_sampler_ddp = replace_sampler_ddp + + @property + def on_tpu(self): + return self.tpu_cores is not None + + @property + def tpu_id(self): + if self.on_tpu: + return self.tpu_cores[0] + + return None + + @property + def on_gpu(self): + return self.parallel_devices and torch.cuda.is_available() + + def set_distributed_mode(self): + + # No distributed backend + if self.distributed_backend is None: + # horovod multi GPU + if self.has_horovodrun(): + self._set_horovod_backend() + + # DDP CPU + elif self.num_gpus == 0: + if self.num_nodes > 1 or self.num_processes > 1: + self.use_ddp = True + + # Single GPU + elif self.num_gpus == 1: + self.use_single_gpu = True + + # Default: DDP-Spawn + elif self.num_gpus > 1: + rank_zero_warn( + 'You requested multiple GPUs but did not specify a backend, e.g.' + ' (distributed_backend="dp"|"ddp"|"ddp2").' + ' Setting distributed_backend="ddp_spawn" for you.' + ) + self.distributed_backend = "ddp_spawn" + + # DP + if self.distributed_backend == "dp": + # do nothing if num_gpus == 0 + if self.num_gpus == 1: + self.use_single_gpu = True + self.use_dp = True + elif self.num_gpus > 1: + self.use_dp = True + + # DDP, DDP-Spawn + elif self.distributed_backend in ("ddp", "ddp_spawn"): + if self.num_gpus == 0: + # DDP CPU + if self.num_nodes > 1 or self.num_processes > 1: + self.use_ddp = True + + # DDP Single GPU + elif self.num_gpus == 1: + self.use_single_gpu = True + self.use_ddp = True + + # DDP Multi GPU + elif self.num_gpus > 1: + self.use_ddp = True + self.num_processes = self.num_gpus + + # DDP2 + elif self.distributed_backend == "ddp2": + # do nothing if num_gpus == 0 + if self.num_gpus >= 1: + self.use_ddp2 = True + + # DDP CPU + elif self.distributed_backend == "ddp_cpu": + if self.num_gpus > 0: + rank_zero_warn( + 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' + ) + self.use_ddp = True + self.data_parallel_device_ids = None + self.on_gpu = False + + # HOROVOD + elif self.distributed_backend == "horovod": + self._set_horovod_backend() + + # throw error to force user ddp or ddp2 choice + if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): + raise MisconfigurationException( + 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' + 'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2' + ) + + rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}') + num_cores = self.tpu_cores if self.tpu_cores is not None else 0 + rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') + + if torch.cuda.is_available() and not self.on_gpu: + rank_zero_warn('GPU available but not used. 
Set the --gpus flag when calling the script.') + + + def _set_horovod_backend(self): + self.check_horovod() + self.use_horovod = True + + # Initialize Horovod to get rank / size info + hvd.init() + if self.on_gpu: + # Horovod assigns one local GPU per process + self.root_gpu = hvd.local_rank() + + def check_horovod(self): + """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" + if not HOROVOD_AVAILABLE: + raise MisconfigurationException( + 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' + 'Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]' + ) + + if self.num_gpus > 1 or self.num_nodes > 1: + raise MisconfigurationException( + 'Horovod does not support setting num_nodes / num_gpus explicitly. Use ' + 'horovodrun / mpirun to configure the number of processes.' + ) + + @staticmethod + def has_horovodrun(): + """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" + return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ \ No newline at end of file diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 1fdae7270fe47..401dc549c5327 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -3,10 +3,10 @@ class Plugin(object): - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): - return model, optimizers, lr_schedulers + def connect(self, model: torch.nn.Module, *args, **kwargs): + return model - def pre_optimizer_step(self, optimizer, optiizer_idx): + def pre_optimizer_step(self, optimizer, optimizer_idx): pass def post_optimizer_step(self, optimizer, optimizer_idx): diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 2c7f9ae4c5924..62a8710034af1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,11 +1,8 @@ from abc import ABC, abstractmethod -from contextlib import contextmanager -from os import stat import re from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin -from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -15,7 +12,6 @@ from pytorch_lightning.core.step_result import Result from typing import Any, Dict, List, Optional, Union from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel -from torch.nn.parallel.data_parallel import DataParallel import sys from os.path import abspath from time import sleep @@ -26,7 +22,7 @@ from pytorch_lightning import _logger as log import contextlib import torch.multiprocessing as mp -from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info try: from hydra.utils import to_absolute_path, get_original_cwd @@ -73,6 +69,37 @@ def is_global_zero(self): def barrier(self): raise NotImplementedError + def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): + if device_ids is None: + return + + # set the correct cuda visible devices (using pci order) + 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) + log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') + + + def determine_local_rank(self): + return int(os.environ.get('LOCAL_RANK', 0)) + + + def determine_node_rank(self): + + # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. + # otherwise use given node rank or default to node rank 0 + env_vars = ['NODE_RANK', 'GROUP_RANK'] + node_ids = [(k, os.environ.get(k, None)) for k in env_vars] + node_ids = [(k, v) for k, v in node_ids if v is not None] + if len(node_ids) == 0: + return 0 + if len(node_ids) > 1: + log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.") + k, rank = node_ids.pop() + rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") + return int(rank) + + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): super().__init__(logger=logger) @@ -90,10 +117,16 @@ def root_device(self): return self.device def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + def connect(self, model: torch.nn.Module): self.model = model + self.model_to_device() + + return self.model @property def is_global_zero(self): @@ -174,6 +207,18 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) - def root_device(self): return self.parallel_device_ids[self.local_rank] + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() + def setup(self, model): self.model = model @@ -269,7 +314,7 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx # TODO: check from where we get node_rank and num_processes - self.global_rank = self.node_rank * self.num_processes + self.task_idx + self.global_rank = self.determine_node_rank() * self.num_processes + self.task_idx self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): @@ -302,8 +347,8 @@ def pre_training(self): # show progressbar only on progress_rank 0 # TODO: check where to move this. Cannot stay here, since we won't have access to progressbar here - if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + # if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() # determine which process we are and world size self.set_world_ranks() @@ -312,7 +357,7 @@ def pre_training(self): rank_zero_only.rank = self.global_rank # TODO: This has to be done somewhere else! 
- self.model.trainer = self.trainer + # self.model.trainer = self.trainer # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken @@ -321,7 +366,7 @@ def pre_training(self): self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) # TODO: Move this somewhere else - self.trainer.call_setup_hook(self.model) + # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): @@ -337,7 +382,7 @@ def pre_training(self): # TODO: Check where this can be moved # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(self.model) + # self.trainer.model_connector.copy_trainer_model_properties(self.model) self.configure_ddp() @@ -393,7 +438,7 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, p self.dist = LightningDistributed() # TODO: how to get in nprocs? probably pass it - self.nprocs = nprocs + self.num_processes = num_processes self.mp_queue = None self.proc_offset = proc_offset @@ -407,14 +452,14 @@ def setup(self, model): def set_world_ranks(self): self.local_rank = self.process_idx # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.node_rank * self.num_processes + self.self.process_idx + self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx self.world_size = self.num_nodes * self.num_processes def pre_training(self): # TODO: Check if current process can be used as one training proc # start from one since current process is proc 0 - for proc_idx in range(1, self.nprocs): + for proc_idx in range(1, self.num_processes): # use os.fork, since this enables us to continue from here # instead of spawning with separate function pid = os.fork() @@ -430,8 +475,8 @@ def pre_training(self): # TODO: Check where to put that since we don't have access to the pbar here # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() self.set_world_ranks() @@ -439,7 +484,7 @@ def pre_training(self): rank_zero_only.rank = self.global_rank # TODO: This has to be done somewhere else! - self.model.trainer = self.trainer + # self.model.trainer = self.trainer # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken @@ -448,7 +493,7 @@ def pre_training(self): self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) # TODO: Move this somewhere else - self.trainer.call_setup_hook(self.model) + # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): @@ -464,7 +509,7 @@ def pre_training(self): # TODO: Check where this can be moved # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(self.model) + # self.trainer.model_connector.copy_trainer_model_properties(self.model) self.configure_ddp() @@ -473,7 +518,8 @@ def pre_training(self): def post_training(self, results, best_model_path): # get original model # TODO: How To get this? is this simply self.model? 
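A note on the TODO above: by the time `post_training` runs, `self.model` is the `LightningDistributedDataParallel` wrapper produced by `configure_ddp`, so the plain `LightningModule` is its `.module` attribute; the `lightning_module` property introduced in the "add model properties" patch further below returns exactly that. A minimal illustration with a hypothetical `plugin` instance:

    wrapped = plugin.model         # LightningDistributedDataParallel after configure_ddp()
    original = wrapped.module      # the underlying LightningModule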
- model = self.trainer.get_model() + # model = self.trainer.get_model() + model = self.model # persist info in ddp_spawn self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) @@ -513,7 +559,8 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat # save the last weights last_path = None # TODO: From where to get self.trainer.testing? - if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + if best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) atomic_save(self.model.state_dict(), last_path) self.mp_queue.put(last_path) @@ -522,18 +569,30 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat def __recover_child_process_weights(self, model, best_path, last_path): # TODO: Where can we set this? # transfer back the best path to the trainer - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path + # if self.trainer.checkpoint_callback: + # self.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also best score # load last weights # TODO: How to get self.trainer.testing? - if last_path is not None and not self.trainer.testing: + if last_path is not None: # and not self.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt) # TODO: Where to set this? # Do we really need to set this or can we just make the trainer property forward our current property here? - self.trainer.model = model + # self.trainer.model = model + + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 0b53e3addbbd7..ca41e8242f104 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties from pytorch_lightning.core.lightning import LightningModule @@ -18,7 +19,7 @@ class PrecisionPlugin(Plugin): EPSILON = 1e-6 precision = 32 - def pre_optimizer_step(self, optimizer, optiizer_idx): + def pre_optimizer_step(self, optimizer, optimizer_idx): pass def post_optimizer_step(self, optimizer, optimizer_idx): @@ -77,20 +78,21 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss + @contextmanager + def train_step_context(self): + yield torch.cuda.amp.autocast() + class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self): + def __init__(self, amp_level): self.backend = AMPType.APEX + self.amp_level = amp_level def connect(self, model, optimizers, lr_schedulers): - model, optimizers = self.configure_apex(amp, model, optimizers, self.trainer.amp_level) + model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) reinit_scheduler_properties(optimizers, 
lr_schedulers) return model, optimizers, lr_schedulers - def training_step(self, fx, args): - output = fx(args) - return output - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = amp.scale_loss(closure_loss, optimizer) From eac87c38d04f6968108a5ec3df77721c4743be21 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:04 +0100 Subject: [PATCH 015/157] update accelerator for model to live in traintype plugin --- pytorch_lightning/accelerators/accelerator.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 21e0f191e384e..9d84c2cbadc49 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -18,12 +18,10 @@ class NewAccelerator(object): def __init__( self, - model_ref: LightningModule, precision_plugin: PrecisionPlugin, training_type_plugin: TrainingTypePlugin, gradient_clip_val, ): - self.model_ref = model_ref self.precision_plugin = precision_plugin self.training_type_plugin = training_type_plugin self.gradient_clip_val = gradient_clip_val @@ -37,6 +35,18 @@ def setup(self, model): self.setup_optimizers(model) self.connect_precision_plugin() + @property + def model(self): + return self.training_type_plugin.model + + @model.setter + def model(self, new_model): + self.training_type_plugin.model = new_model + + @property + def lightning_module(self): + return self.training_type_plugin.lightning_module + @property def root_device(self): return self.training_type_plugin.root_device @@ -84,6 +94,8 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + # TODO: Check out if this can be simplified with new LightningOptimizer! 
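On the TODO above: if the optimizers are wrapped in a `LightningOptimizer`-style object that routes `step()` through the precision and training type plugins itself (an assumption, not shown in these patches), most of the branching in this method could collapse to a plain forwarding call, roughly:

    def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure):
        # the wrapper takes care of native amp / apex / LBFGS handling internally
        self.precision_plugin.pre_optimizer_step(optimizer, opt_idx)
        optimizer.step(closure=lambda_closure)
        self.precision_plugin.post_optimizer_step(optimizer, opt_idx)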
+ model_ref = self.model_ref is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = self.trainer.amp_backend == AMPType.NATIVE @@ -171,12 +183,15 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, plugin: Plugin): - model, optimizers, schedulers = plugin.connect( + def connect_training_type_plugin(self, plugin: TrainingTypePlugin): + plugin.connect( self.model_ref ) - self.model_ref = model + def connect_precision_plugin(self, plugin: PrecisionPlugin): + model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) + + self.model = model self.optimizers = optimizers self.schedulers = schedulers From d111471a62b762dd1ac2dd1dc8fa04bd61c57fe3 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:23 +0100 Subject: [PATCH 016/157] add general plugin interface --- pytorch_lightning/accelerators/base_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 401dc549c5327..42b3e1f00b932 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -4,7 +4,7 @@ class Plugin(object): def connect(self, model: torch.nn.Module, *args, **kwargs): - return model + pass def pre_optimizer_step(self, optimizer, optimizer_idx): pass From 3d6c4b89dadcf824eb64208798a7572b8da09f3f Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:39 +0100 Subject: [PATCH 017/157] add model properties --- .../accelerators/data_parallel.py | 334 ++++++++++-------- 1 file changed, 177 insertions(+), 157 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 62a8710034af1..8281e39e71134 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -42,7 +42,7 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): def __init__(self, logger=None): - self.model = None + self._model = None self.global_rank = 0 self.logger = logger @@ -99,6 +99,18 @@ def determine_node_rank(self): rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") return int(rank) + @property + def model(self): + return self._model + + @model.setter + def model(self, new_model): + self._model = new_model + + @property + def lightning_module(self): + return self._model + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): @@ -120,10 +132,10 @@ def model_to_device(self): if self.on_gpu: torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) + self._model.to(self.root_device) def connect(self, model: torch.nn.Module): - self.model = model + self._model = model self.model_to_device() return self.model @@ -174,7 +186,7 @@ def is_global_zero(self) -> bool: class DataParallelPlugin(ParallelPlugin): def setup(self, model): - self.model = LightningDataParallel(model, self.parallel_device_ids) + self._model = LightningDataParallel(model, self.parallel_device_ids) def reduce(self, output): if isinstance(output, Result): @@ -189,6 +201,10 @@ def reduce(self, output): def root_device(self): return self.parallel_device_ids[0] + @property + def lightning_module(self): + return self._model.module + def barrier(self): pass @@ -221,7 +237,7 @@ def determine_node_rank(self): def setup(self, model): - self.model = model 
+ self._model = model # start the other scripts if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": @@ -230,6 +246,10 @@ def setup(self, model): # set the task idx self.task_idx = int(os.environ["LOCAL_RANK"]) + @property + def lightning_module(self): + return self._model.module + def _call_children_scripts(self): # bookkeeping of spawned processes @@ -320,7 +340,7 @@ def set_world_ranks(self): def configure_ddp(self): # if unset, default `find_unused_parameters` `True` self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( + self._model = LightningDistributedDataParallel( self.model, device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, @@ -431,168 +451,168 @@ def reduce(self, output, group: Optional[Any] = None, return output -class DDPSpawnPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) - self.process_idx = None +# class DDPSpawnPlugin(ParallelPlugin): +# def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): +# super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) +# self.process_idx = None - self.dist = LightningDistributed() - # TODO: how to get in nprocs? probably pass it - self.num_processes = num_processes - self.mp_queue = None - self.proc_offset = proc_offset +# self.dist = LightningDistributed() +# # TODO: how to get in nprocs? probably pass it +# self.num_processes = num_processes +# self.mp_queue = None +# self.proc_offset = proc_offset - def setup(self, model): - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) +# def setup(self, model): +# os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() +# # pass in a state q +# smp = mp.get_context('spawn') +# self.mp_queue = smp.SimpleQueue() - def set_world_ranks(self): - self.local_rank = self.process_idx - # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx - self.world_size = self.num_nodes * self.num_processes +# def set_world_ranks(self): +# self.local_rank = self.process_idx +# # check from where we get node_rank, num_processes and num_nodes +# self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx +# self.world_size = self.num_nodes * self.num_processes - def pre_training(self): +# def pre_training(self): - # TODO: Check if current process can be used as one training proc - # start from one since current process is proc 0 - for proc_idx in range(1, self.num_processes): - # use os.fork, since this enables us to continue from here - # instead of spawning with separate function - pid = os.fork() +# # TODO: Check if current process can be used as one training proc +# # start from one since current process is proc 0 +# for proc_idx in range(1, self.num_processes): +# # use os.fork, since this enables us to continue from here +# # instead of spawning with separate function +# pid = os.fork() - # set in child processes (PID=0). 
All previous child processes - # should already have their process_idx assigned - if pid == 0 and self.process_idx is None: - self.process_idx = proc_idx + self.proc_offset +# # set in child processes (PID=0). All previous child processes +# # should already have their process_idx assigned +# if pid == 0 and self.process_idx is None: +# self.process_idx = proc_idx + self.proc_offset - # set process idx for current process - if pid != 0: - self.process_idx = 0 + self.proc_offset +# # set process idx for current process +# if pid != 0: +# self.process_idx = 0 + self.proc_offset - # TODO: Check where to put that since we don't have access to the pbar here - # show progressbar only on progress_rank 0 - # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() +# # TODO: Check where to put that since we don't have access to the pbar here +# # show progressbar only on progress_rank 0 +# # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: +# # self.trainer.progress_bar_callback.disable() - self.set_world_ranks() +# self.set_world_ranks() - # set warning rank - rank_zero_only.rank = self.global_rank +# # set warning rank +# rank_zero_only.rank = self.global_rank - # TODO: This has to be done somewhere else! - # self.model.trainer = self.trainer - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks - self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) - - # TODO: Move this somewhere else - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - - self.configure_ddp() - - self.barrier() - - def post_training(self, results, best_model_path): - # get original model - # TODO: How To get this? is this simply self.model? 
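# Hedged sketch of the fork-based spawning the commented-out DDPSpawnPlugin above experiments
# with (simplified, Unix-only, and not the patch's code): the parent process keeps index 0 and
# each forked child takes the loop index it was created at, so every process ends up with a
# distinct local rank without a separate spawn function.
import os

def fork_workers(num_processes: int, proc_offset: int = 0) -> int:
    process_idx = None
    for candidate_idx in range(1, num_processes):
        pid = os.fork()
        if pid == 0:                            # child: remember its index and stop forking
            process_idx = candidate_idx + proc_offset
            break
    if process_idx is None:                     # parent: it is process 0
        process_idx = 0 + proc_offset
    return process_idx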
- # model = self.trainer.get_model() - model = self.model - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) - - # clean up memory - torch.cuda.empty_cache() - - if self.process_idx == 0: - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - last_path = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path, last_path) - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def determine_ddp_device_ids(self): - return [self.root_device] - - def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): - - if self.global_rank == 0 and self.mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(results) - - # save the last weights - last_path = None - # TODO: From where to get self.trainer.testing? - # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - if best_model_path is not None and len(best_model_path) > 0: - last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.model.state_dict(), last_path) - self.mp_queue.put(last_path) - - - def __recover_child_process_weights(self, model, best_path, last_path): - # TODO: Where can we set this? - # transfer back the best path to the trainer - # if self.trainer.checkpoint_callback: - # self.trainer.checkpoint_callback.best_model_path = best_path - # todo, pass also best score - - # load last weights - # TODO: How to get self.trainer.testing? - if last_path is not None: # and not self.trainer.testing: - ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - # TODO: Where to set this? - # Do we really need to set this or can we just make the trainer property forward our current property here? - # self.trainer.model = model - - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() +# # TODO: This has to be done somewhere else! +# # self.model.trainer = self.trainer + +# # set up server using proc 0's ip address +# # try to init for 20 times at max in case ports are taken +# # where to store ip_table +# # TODO: CHeck is_slurm_managing_tasks +# self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + +# # TODO: Move this somewhere else +# # self.trainer.call_setup_hook(self.model) + +# # on world_size=0 let everyone know training is starting +# if self.is_global_zero and not torch.distributed.is_initialized(): +# log.info("-" * 100) +# log.info(f"distributed_backend={self.distributed_backend}") +# log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") +# log.info("-" * 100) + +# self.model = self.configure_sync_batchnorm(self.model) + +# # move the model to the correct device +# self.model_to_device() + +# # TODO: Check where this can be moved +# # set model properties before going into wrapper +# # self.trainer.model_connector.copy_trainer_model_properties(self.model) + +# self.configure_ddp() + +# self.barrier() + +# def post_training(self, results, best_model_path): +# # get original model +# # TODO: How To get this? is this simply self.model? +# # model = self.trainer.get_model() +# model = self.model + +# # persist info in ddp_spawn +# self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + +# # clean up memory +# torch.cuda.empty_cache() + +# if self.process_idx == 0: +# # restore main state with best weights +# best_path = self.mp_queue.get() +# results = self.mp_queue.get() +# last_path = self.mp_queue.get() + +# # recover the weights of the processes trained in the children +# self.__recover_child_process_weights(model, best_path, last_path) + +# def configure_ddp(self): +# # if unset, default `find_unused_parameters` `True` +# self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) +# self.model = LightningDistributedDataParallel( +# self.model, +# device_ids=self.determine_ddp_device_ids(), +# **self._ddp_kwargs, +# ) + +# def determine_ddp_device_ids(self): +# return [self.root_device] + +# def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + +# if self.global_rank == 0 and self.mp_queue is not None: +# rank_zero_warn('cleaning up ddp environment...') +# # todo, pass complete checkpoint as state dictionary +# self.mp_queue.put(best_model_path) +# self.mp_queue.put(results) + +# # save the last weights +# last_path = None +# # TODO: From where to get self.trainer.testing? +# # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: +# if best_model_path is not None and len(best_model_path) > 0: +# last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) +# atomic_save(self.model.state_dict(), last_path) +# self.mp_queue.put(last_path) + + +# def __recover_child_process_weights(self, model, best_path, last_path): +# # TODO: Where can we set this? +# # transfer back the best path to the trainer +# # if self.trainer.checkpoint_callback: +# # self.trainer.checkpoint_callback.best_model_path = best_path +# # todo, pass also best score + +# # load last weights +# # TODO: How to get self.trainer.testing? +# if last_path is not None: # and not self.trainer.testing: +# ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) +# model.load_state_dict(ckpt) + +# # TODO: Where to set this? +# # Do we really need to set this or can we just make the trainer property forward our current property here? 
+# # self.trainer.model = model + +# def determine_local_rank(self): +# if self.is_slurm_managing_tasks: +# return int(os.environ['SLURM_LOCALID']) +# else: +# return super().determine_node_rank() + +# def determine_node_rank(self): +# if self.is_slurm_managing_tasks: +# return int(os.environ['SLURM_NODEID']) +# else: +# return super().determine_node_rank() # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 51740e9be57aea0fe07f9b2bdd453dbd72351bd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 4 Dec 2020 23:30:49 +0100 Subject: [PATCH 018/157] Trainer integration part 1 for CPU accelerator --- pytorch_lightning/accelerators/__init__.py | 0 pytorch_lightning/accelerators/accelerator.py | 43 +++++----- .../accelerators/accelerator_connector.py | 32 ++++++-- .../accelerators/data_parallel.py | 32 +++++--- .../callbacks/model_checkpoint.py | 2 +- pytorch_lightning/core/lightning.py | 11 ++- .../connectors/checkpoint_connector.py | 2 +- .../trainer/connectors/model_connector.py | 5 +- pytorch_lightning/trainer/data_loading.py | 13 +-- pytorch_lightning/trainer/properties.py | 80 ++++++++++++++++--- pytorch_lightning/trainer/trainer.py | 74 +++++++++++------ pytorch_lightning/trainer/training_loop.py | 7 +- 12 files changed, 209 insertions(+), 92 deletions(-) create mode 100644 pytorch_lightning/accelerators/__init__.py diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9d84c2cbadc49..c4f5bc3a57554 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,10 +30,10 @@ def __init__( self.lr_schedulers = None self.optimizer_frequencies = None - def setup(self, model): - self.connect_training_type_plugin() - self.setup_optimizers(model) - self.connect_precision_plugin() + def setup(self, trainer, model): + self.connect_training_type_plugin(self.training_type_plugin, model) + self.setup_optimizers(trainer, model) + self.connect_precision_plugin(self.precision_plugin) @property def model(self): @@ -55,7 +55,7 @@ def teardown(self): pass def batch_to_device(self, batch: Any, device: torch.device): - model = self.model_ref + model = self.model if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) @@ -67,7 +67,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.model_ref.training_step(*args) + return self.model.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) @@ -76,7 +76,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.model_ref.validation_step(*args) + return self.model.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) @@ -85,7 +85,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.model_ref.test_step(*args) + return self.model.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -96,7 +96,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, current_epoch, 
batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! - model_ref = self.model_ref + model_ref = self.model is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = self.trainer.amp_backend == AMPType.NATIVE @@ -173,20 +173,16 @@ def on_train_end(self): def early_stopping_should_stop(self, pl_module): return self.trainer.should_stop - def setup_optimizers(self, model): - # TODO: Check if we can change logic for early stopping to trainer completely (should be self contained) - if self.trainer.testing is True: + def setup_optimizers(self, trainer, model): + if trainer.testing is True: return - - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + optimizers, lr_schedulers, optimizer_frequencies = trainer.init_optimizers(model) self.optimizers = optimizers self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, plugin: TrainingTypePlugin): - plugin.connect( - self.model_ref - ) + def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule): + plugin.connect(model) def connect_precision_plugin(self, plugin: PrecisionPlugin): model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) @@ -195,30 +191,29 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin): self.optimizers = optimizers self.schedulers = schedulers - def to_device(self, batch): return self.batch_to_device(batch, self.root_device) class NewCPUAccelerator(NewAccelerator): - def setup(self, model): + def setup(self, trainer, model): if isinstance(self.precision_plugin, MixedPrecisionPlugin): MisconfigurationException("amp + cpu is not supported. Please use a GPU option") if "cpu" not in str(self.root_device): raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") - return super().setup(model) + return super().setup(trainer, model) class NewGPUAccelerator(NewAccelerator): - def setup(self, model): + def setup(self, trainer, model): if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") torch.cuda.set_device(self.root_device) - self.model_ref.to(self.root_device) + model.to(self.root_device) - return super().setup(model) + return super().setup(trainer, model) # TODO: Add NewTPUAccelerator diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d9a111f355e68..07fd9eb6f49a4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -11,10 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
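# Rough, illustrative sketch (stand-in classes, not the real Lightning API) of the setup order
# the new `Accelerator.setup(trainer, model)` above enforces: 1) the training type plugin
# receives the raw model, 2) optimizers are built from the trainer (skipped when testing),
# 3) the precision plugin may rewrap model and optimizers (e.g. for amp).
import torch

class _TrainingType:
    def connect(self, model):
        self.model = model                      # place / wrap the model here

class _Precision:
    def connect(self, model, optimizers, schedulers):
        return model, optimizers, schedulers    # amp variants would wrap these

class _Accelerator:
    def __init__(self):
        self.training_type_plugin = _TrainingType()
        self.precision_plugin = _Precision()
        self.optimizers = self.lr_schedulers = None

    def setup(self, trainer, model):
        self.training_type_plugin.connect(model)
        optimizers = None if getattr(trainer, "testing", False) else [
            torch.optim.SGD(model.parameters(), lr=0.1)
        ]
        _, self.optimizers, self.lr_schedulers = self.precision_plugin.connect(
            model, optimizers, self.lr_schedulers
        )

_Accelerator().setup(trainer=None, model=torch.nn.Linear(2, 1))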
+from typing import Union + from pytorch_lightning import accelerators import os import torch +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info @@ -22,7 +27,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment -from pytorch_lightning.accelerators.accelerator import Accelerator try: import torch_xla @@ -62,11 +66,11 @@ def __init__( self.use_ddp2 = False self.use_horovod = False self.use_single_gpu = False - self.num_gpus = None self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - self.accelerator = accelerator + # todo: select accelerator based on trainer flags + self.accelerator = self.select_accelerator(accelerator) self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -105,13 +109,13 @@ def __init__( # link up SLURM # TODO: this should be taken out of here... but depends too much on DDP - self.slurm_connector.on_trainer_init(self.num_nodes) - self.node_rank = self.determine_ddp_node_rank() - self.local_rank = self.determine_local_rank() + # self.slurm_connector.on_trainer_init(self.num_nodes) + # self.node_rank = self.determine_ddp_node_rank() + # self.local_rank = self.determine_local_rank() self.global_rank = 0 # NVIDIA setup - self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') @@ -132,6 +136,20 @@ def tpu_id(self): def on_gpu(self): return self.parallel_devices and torch.cuda.is_available() + @property + def num_gpus(self) -> int: + gpus = self.parallel_devices + if gpus is None: + return 0 + return len(gpus) + + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): + return NewCPUAccelerator( + precision_plugin=PrecisionPlugin(), + training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + gradient_clip_val=None + ) + def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8281e39e71134..9d0b47c1ee345 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -46,13 +46,13 @@ def __init__(self, logger=None): self.global_rank = 0 self.logger = logger - @abstractmethod @property + @abstractmethod def on_gpu(self): raise NotImplementedError - @abstractmethod @property + @abstractmethod def root_device(self): raise NotImplementedError @@ -60,13 +60,17 @@ def root_device(self): def model_to_device(self): raise NotImplementedError - @abstractmethod @property + @abstractmethod def is_global_zero(self): raise NotImplementedError @abstractmethod - def barrier(self): + def barrier(self, name: Optional[str] = None): + raise NotImplementedError + + @abstractmethod + def broadcast(self, obj: object, src: int = 0) -> object: 
raise NotImplementedError def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): @@ -79,10 +83,8 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - def determine_local_rank(self): return int(os.environ.get('LOCAL_RANK', 0)) - def determine_node_rank(self): @@ -144,10 +146,12 @@ def connect(self, model: torch.nn.Module): def is_global_zero(self): return True - def barrier(self): + def barrier(self, *args, **kwargs): pass - + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + class ParallelPlugin(TrainingTypePlugin, ABC): def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): @@ -161,8 +165,8 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): def reduce(self, output): raise NotImplementedError - @abstractmethod @property + @abstractmethod def root_device(self): raise NotImplementedError @@ -205,9 +209,12 @@ def root_device(self): def lightning_module(self): return self._model.module - def barrier(self): + def barrier(self, *args, **kwargs): pass + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + class DDPPlugin(ParallelPlugin): @@ -432,10 +439,13 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: return model - def barrier(self): + def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + def model_to_device(self): # TODO: Can we easily make this a property that falls back here? # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 8a89cd2bef23c..32f83190e119d 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -445,7 +445,7 @@ def __resolve_ckpt_dir(self, trainer, pl_module): else f"version_{trainer.logger.version}" ) - version, name = trainer.accelerator_backend.broadcast((version, trainer.logger.name)) + version, name = trainer.training_type_plugin.broadcast((version, trainer.logger.name)) ckpt_path = os.path.join( save_dir, str(name), version, "checkpoints" diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index dd5691d6e4553..33d206b6bc49d 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -126,6 +126,14 @@ def global_step(self) -> int: """Total training batches seen across all epochs""" return self.trainer.global_step if self.trainer else 0 + @property + def global_rank(self): + return self.trainer.global_rank if self.trainer else 0 + + @property + def local_rank(self): + return self.trainer.local_rank if self.trainer else 0 + @example_input_array.setter def example_input_array(self, example: Any) -> None: self._example_input_array = example @@ -253,6 +261,7 @@ def log( f"Logged key: {name} should not contain information about dataloader_idx.") accelerator = self.trainer.accelerator_backend + training_type_plugin = self.trainer.training_type_plugin self._results.log( name, @@ -268,7 +277,7 @@ def log( sync_dist, sync_dist_op, sync_dist_group, - accelerator.sync_tensor, + training_type_plugin.reduce, self._current_dataloader_idx, self.device, ) diff --git 
a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 001b0b9ed3e0d..8d1a482deff15 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -73,7 +73,7 @@ def restore_weights(self, model: LightningModule) -> None: self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer._device_type == DeviceType.GPU) # wait for all to catch up - self.trainer.accelerator_backend.barrier('TrainerIOMixin.restore_weights') + self.trainer.training_type_plugin.barrier('TrainerIOMixin.restore_weights') # clear cache after restore if self.trainer._device_type == DeviceType.GPU: diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py index a3759d1075ee5..a4bf9a6e505e6 100644 --- a/pytorch_lightning/trainer/connectors/model_connector.py +++ b/pytorch_lightning/trainer/connectors/model_connector.py @@ -31,16 +31,13 @@ def copy_trainer_model_properties(self, model): for m in [model, ref_model]: m.trainer = self.trainer + # TODO: add property getters to LightningModule and access through trainer reference m.logger = self.trainer.logger m._device_type = str(self.trainer._device_type) m._distrib_type = str(self.trainer._distrib_type) m.use_amp = self.trainer.amp_backend is not None m.testing = self.trainer.testing - m.tpu_local_core_rank = self.trainer.tpu_local_core_rank - m.tpu_global_core_rank = self.trainer.tpu_global_core_rank m.precision = self.trainer.precision - m.global_rank = self.trainer.global_rank - m.local_rank = self.trainer.local_rank def get_model(self): return self._get_reference_model(self.trainer.model) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 38198c9f39e10..cc5fc492b3a6a 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import NewAccelerator from pytorch_lightning.core import LightningModule from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import rank_zero_warn @@ -51,7 +51,7 @@ class TrainerDataLoadingMixin(ABC): limit_val_batches: Union[int, float] limit_test_batches: Union[int, float] replace_sampler_ddp: bool - accelerator_backend: Accelerator + accelerator_backend: NewAccelerator num_nodes: int num_processes: int distributed_backend: Optional[str] @@ -62,7 +62,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: # ddp_spawn + num_workers > 0 don't mix! tell the user is_dataloader = isinstance(dataloader, DataLoader) - using_spawn = self.distributed_backend == "ddp_spawn" + using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn" if is_dataloader and not on_windows: if dataloader.num_workers > 0 and using_spawn: rank_zero_warn('Dataloader(num_workers>0) and ddp_spawn do not mix well!' 
@@ -92,8 +92,9 @@ def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader: if not is_dataloader or is_iterable_ds: return dataloader - need_dist_sampler = self.require_distributed_sampler and not isinstance(dataloader.sampler, DistributedSampler) - if self.replace_sampler_ddp and need_dist_sampler: + is_in_dist = self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu + need_dist_sampler = is_in_dist and not isinstance(dataloader.sampler, DistributedSampler) + if self.accelerator_connector.replace_sampler_ddp and need_dist_sampler: if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)): raise MisconfigurationException( 'You seem to have configured a sampler in your DataLoader. This will be replaced ' @@ -314,7 +315,7 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: dataloader = self._flatten_dl_only(dataloader) if self.accelerator_backend is not None: - self.accelerator_backend.barrier('get_dataloaders') + self.training_type_plugin.barrier('get_dataloaders') return dataloader def _flatten_dl_only(self, dataloaders): diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index c32b24458c297..6dc6802bc9021 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,8 +17,9 @@ from argparse import ArgumentParser, Namespace from typing import cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector +from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint, EarlyStopping from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.loggers.base import LightningLoggerBase @@ -42,6 +43,9 @@ if _HOROVOD_AVAILABLE: import horovod.torch as hvd +from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger class TrainerProperties(ABC): @@ -59,14 +63,71 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str - accelerator_backend: Accelerator - logger: LightningLoggerBase - model_connector: ModelConnector - checkpoint_connector: CheckpointConnector - callbacks: List[Callback] + accelerator_backend: NewAccelerator num_nodes: int num_processes: int + @property + def accelerator(self): + return self.accelerator_connector.accelerator + + @property + def accelerator_backend(self): + # for backward compatibility + return self.accelerator + + @property + def training_type_plugin(self): + return self.accelerator.training_type_plugin + + @property + def global_rank(self): + return self.accelerator.training_type_plugin.global_rank + + @property + def local_rank(self): + # some training types define a local rank + return getattr(self.accelerator.training_type_plugin, "local_rank", 0) + + @property + def world_size(self): + # some training types define a world size + return getattr(self.accelerator.training_type_plugin, "world_size", 1) + + @property + def on_gpu(self): + return self.accelerator_connector.on_gpu + + @property + def on_tpu(self): + return 
self.accelerator_connector.on_tpu + + @property + def use_dp(self): + return self.accelerator_connector.use_dp + + @property + def use_ddp(self): + return self.accelerator_connector.use_ddp + + @property + def use_ddp2(self): + return self.accelerator_connector.use_ddp2 + + @property + def use_horovod(self): + return self.accelerator_connector.use_horovod + + @property + def use_single_gpu(self): + return self.accelerator_connector.use_single_gpu + + @property + def use_tpu(self): + # TODO update this, what is the difference between use_tpu and on_tpu? + return False + # return self.accelerator_connector.use_tpu + @property def log_dir(self): if self.checkpoint_callback is not None: @@ -173,10 +234,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: @property def num_gpus(self) -> int: - gpus = self.data_parallel_device_ids - if gpus is None: - return 0 - return len(gpus) + return self.accelerator_connector.num_gpus @property def data_parallel(self) -> bool: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7d7cec2335301..94c698cfb8501 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -26,6 +26,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector from pytorch_lightning.callbacks import Callback +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector +from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import EvalResult, Result @@ -56,13 +58,25 @@ from pytorch_lightning.trainer.states import RunningStage, TrainerState from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector +from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector +from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector +from pytorch_lightning.trainer.connectors.model_connector import ModelConnector +from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector +from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector +from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector +from pytorch_lightning import _logger as log from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach -from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.trainer.properties import TrainerProperties +from pytorch_lightning.plugins.plugin_connector import PluginConnector +from pytorch_lightning.accelerators.accelerator import NewAccelerator # warnings to ignore in trainer 
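# Minimal sketch (illustrative only) of the forwarding pattern the trainer properties above
# introduce: the trainer keeps no rank or device state of its own and defers to the
# accelerator's training type plugin, falling back to single-device defaults when a plugin
# does not define a local rank or world size.
class _SketchTrainerProperties:
    def __init__(self, training_type_plugin):
        self._ttp = training_type_plugin

    @property
    def global_rank(self) -> int:
        return self._ttp.global_rank

    @property
    def local_rank(self) -> int:
        return getattr(self._ttp, "local_rank", 0)   # single-device plugins: default 0

    @property
    def world_size(self) -> int:
        return getattr(self._ttp, "world_size", 1)   # single-device plugins: default 1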
warnings.filterwarnings( @@ -111,7 +125,7 @@ def __init__( val_check_interval: Union[int, float] = 1.0, flush_logs_every_n_steps: int = 100, log_every_n_steps: int = 50, - accelerator: Optional[Union[str, Accelerator]] = None, + accelerator: Optional[Union[str, NewAccelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = 'top', @@ -302,7 +316,20 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.accelerator_connector = AcceleratorConnector(self) + self.accelerator_connector = BackendConnector( + num_processes, + tpu_cores, + accelerator, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + log_gpu_memory, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + ) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) self.precision_connector = PrecisionConnector(self) @@ -313,7 +340,6 @@ def __init__( self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) - self.accelerator_backend = None self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) self.plugin_connector = PluginConnector(self) @@ -351,20 +377,20 @@ def __init__( ) # init accelerator related flags - self.accelerator_connector.on_trainer_init( - num_processes, - tpu_cores, - accelerator, - distributed_backend, - auto_select_gpus, - gpus, - num_nodes, - log_gpu_memory, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - ) + # self.accelerator_connector.on_trainer_init( + # num_processes, + # tpu_cores, + # accelerator, + # distributed_backend, + # auto_select_gpus, + # gpus, + # num_nodes, + # log_gpu_memory, + # sync_batchnorm, + # benchmark, + # replace_sampler_ddp, + # deterministic, + # ) # init train loop related flags # TODO: remove in 1.3.0 @@ -460,17 +486,19 @@ def fit( # ---------------------------- # SET UP TRAINING # ---------------------------- - self.accelerator_backend = self.accelerator_connector.select_accelerator() + # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) - self.accelerator_backend.setup(model) + self.accelerator_backend.setup(self, model) # ---------------------------- # INSPECT THESE FOR MAIN LOOPS # ---------------------------- # assign training and eval functions... 
inspect these to see the train and eval loops :) - self.accelerator_backend.train_loop = self.train - self.accelerator_backend.validation_loop = self.run_evaluation - self.accelerator_backend.test_loop = self.run_evaluation + # self.accelerator_backend.train_loop = self.train + # self.accelerator_backend.validation_loop = self.run_evaluation + # self.accelerator_backend.test_loop = self.run_evaluation + self.train_loop.setup_training(model) + self.train() # ---------------------------- # TRAIN diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 78cb08f22161f..2b1af8dfeea01 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -140,8 +140,9 @@ def setup_training(self, model: LightningModule): ref_model = self.trainer.get_model() # set the ranks and devices - self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - self.trainer.accelerator_backend.dist.device = ref_model.device + # TODO dist was a AttributeDict, should be moved to plugin? + # self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank + # self.trainer.accelerator_backend.dist.device = ref_model.device # give model convenience properties ref_model.trainer = self.trainer @@ -163,7 +164,7 @@ def setup_training(self, model: LightningModule): self.trainer.logger.save() # wait for all to join if on distributed - self.trainer.accelerator_backend.barrier("setup_training") + self.trainer.accelerator.training_type_plugin.barrier("setup_training") # register auto-resubmit when on SLURM self.trainer.slurm_connector.register_slurm_signal_handlers() From 9e4856898a0e411cd1c948ab1ea4d112289d13bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 6 Dec 2020 03:08:06 +0100 Subject: [PATCH 019/157] test single gpu trainer integration --- pytorch_lightning/accelerators/accelerator.py | 12 +++++++++- .../accelerators/accelerator_connector.py | 23 ++++++++++++++----- pytorch_lightning/trainer/training_loop.py | 7 ------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c4f5bc3a57554..502646011e4de 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -60,6 +60,9 @@ def batch_to_device(self, batch: Any, device: torch.device): return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) + def on_train_start(self): + pass + def training_step(self, args): batch = self.to_device(args[0]) @@ -215,5 +218,12 @@ def setup(self, trainer, model): return super().setup(trainer, model) + def on_train_start(self): + # clear cache before training + # use context because of: + # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() + -# TODO: Add NewTPUAccelerator +# TODO: Add NewTPUAccelerator \ No newline at end of file diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 07fd9eb6f49a4..d0b17c9654a04 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -17,7 +17,7 @@ import os import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.accelerator import 
NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser @@ -69,8 +69,6 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - # todo: select accelerator based on trainer flags - self.accelerator = self.select_accelerator(accelerator) self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -94,10 +92,13 @@ def __init__( self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) - self.root_device = torch.device("cpu") + # self.root_device = torch.device("cpu") self.set_distributed_mode() + # todo: select accelerator based on trainer flags + self.accelerator = self.select_accelerator(accelerator) + # override dist backend when using tpus if self.on_tpu: self.distributed_backend = "tpu" @@ -143,10 +144,20 @@ def num_gpus(self) -> int: return 0 return len(gpus) + def select_precision_plugin(self): + return PrecisionPlugin() + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): - return NewCPUAccelerator( + + # return NewCPUAccelerator( + # precision_plugin=PrecisionPlugin(), + # training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + # gradient_clip_val=None + # ) + + return NewGPUAccelerator( precision_plugin=PrecisionPlugin(), - training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + training_type_plugin=SingleDevicePlugin(device=torch.device("cuda", self.root_gpu)), gradient_clip_val=None ) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2b1af8dfeea01..25540791209ff 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -101,13 +101,6 @@ def should_skip_training(self): return False def on_train_start(self): - # clear cache before training - if self.trainer._device_type == DeviceType.GPU and self.trainer.root_gpu is not None: - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(f"cuda:{self.trainer.root_gpu}"): - torch.cuda.empty_cache() - # hook self.trainer.call_hook("on_train_start") From 5da773a341acbf39711bf97b6387e6e265441b6b Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:43:58 +0100 Subject: [PATCH 020/157] make device changes a bit less hardcoded --- .../accelerators/accelerator_connector.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0b17c9654a04..b7486d60a47b0 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -82,8 +82,8 @@ def __init__( # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks # this way we only show it on rank 0 - if 'LOCAL_RANK' in os.environ: - rank_zero_only.rank = int(os.environ['LOCAL_RANK']) + if "LOCAL_RANK" in os.environ: + rank_zero_only.rank = int(os.environ["LOCAL_RANK"]) # TODO: Move autoselect GPUS to other place # for gpus allow int, string and gpu list @@ -118,7 +118,7 @@ def __init__( # 
NVIDIA setup # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) - self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') + self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") self.replace_sampler_ddp = replace_sampler_ddp @@ -147,6 +147,9 @@ def num_gpus(self) -> int: def select_precision_plugin(self): return PrecisionPlugin() + def select_training_type_plugin(self): + return SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): # return NewCPUAccelerator( @@ -155,10 +158,15 @@ def select_accelerator(self, accelerator: Union[str, NewAccelerator]): # gradient_clip_val=None # ) - return NewGPUAccelerator( - precision_plugin=PrecisionPlugin(), - training_type_plugin=SingleDevicePlugin(device=torch.device("cuda", self.root_gpu)), - gradient_clip_val=None + if self.on_gpu: + acc_cls = NewGPUAccelerator + else: + acc_cls = NewCPUAccelerator + + return acc_cls( + precision_plugin=self.select_precision_plugin(), + training_type_plugin=self.select_training_type_plugin(), + gradient_clip_val=None, ) def set_distributed_mode(self): @@ -181,7 +189,7 @@ def set_distributed_mode(self): # Default: DDP-Spawn elif self.num_gpus > 1: rank_zero_warn( - 'You requested multiple GPUs but did not specify a backend, e.g.' + "You requested multiple GPUs but did not specify a backend, e.g." ' (distributed_backend="dp"|"ddp"|"ddp2").' ' Setting distributed_backend="ddp_spawn" for you.' ) @@ -201,8 +209,8 @@ def set_distributed_mode(self): if self.num_gpus == 0: # DDP CPU if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - + self.use_ddp = True + # DDP Single GPU elif self.num_gpus == 1: self.use_single_gpu = True @@ -223,7 +231,7 @@ def set_distributed_mode(self): elif self.distributed_backend == "ddp_cpu": if self.num_gpus > 0: rank_zero_warn( - 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' + "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." ) self.use_ddp = True self.data_parallel_device_ids = None @@ -236,18 +244,17 @@ def set_distributed_mode(self): # throw error to force user ddp or ddp2 choice if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): raise MisconfigurationException( - 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' - 'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2' + "DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. " + "To silence this warning set distributed_backend=ddp or distributed_backend=ddp2" ) - rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}') + rank_zero_info(f"GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}") num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') + rank_zero_info(f"TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores") if torch.cuda.is_available() and not self.on_gpu: - rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') + rank_zero_warn("GPU available but not used. 
Set the --gpus flag when calling the script.") - def _set_horovod_backend(self): self.check_horovod() self.use_horovod = True @@ -263,16 +270,16 @@ def check_horovod(self): if not HOROVOD_AVAILABLE: raise MisconfigurationException( 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' - 'Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]' + "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" ) if self.num_gpus > 1 or self.num_nodes > 1: raise MisconfigurationException( - 'Horovod does not support setting num_nodes / num_gpus explicitly. Use ' - 'horovodrun / mpirun to configure the number of processes.' + "Horovod does not support setting num_nodes / num_gpus explicitly. Use " + "horovodrun / mpirun to configure the number of processes." ) @staticmethod def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" - return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ \ No newline at end of file + return "OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ From 42e53beb84717b0ed69636c9023cda430eb6aa3e Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:13 +0100 Subject: [PATCH 021/157] properly resolve attributes --- pytorch_lightning/accelerators/accelerator.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 502646011e4de..e2f044fab612f 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -70,7 +70,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.model.training_step(*args) + return self.lightning_module.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) @@ -79,7 +79,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.model.validation_step(*args) + return self.lightning_module.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) @@ -88,7 +88,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.model.test_step(*args) + return self.lightning_module.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -99,14 +99,14 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
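# Usage-style sketch (simplified; the continuation in comments assumes the class names from
# the diffs above) of the single-device selection path the connector implements: derive the
# root device from the flags, wrap it in a single-device training type plugin, and pick the
# CPU or GPU accelerator accordingly.
import torch

def select_root_device(on_gpu: bool, root_gpu: int = 0) -> torch.device:
    return torch.device(f"cuda:{root_gpu}") if on_gpu else torch.device("cpu")

on_gpu = torch.cuda.is_available()
root_device = select_root_device(on_gpu)
# With the classes from the diffs above this would continue roughly as:
# accelerator_cls = NewGPUAccelerator if on_gpu else NewCPUAccelerator
# accelerator = accelerator_cls(
#     precision_plugin=PrecisionPlugin(),
#     training_type_plugin=SingleDevicePlugin(device=root_device),
# )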
- model_ref = self.model + model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + native_amp = isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE - self.precision_plugin.pre_optimizer_step(optimizer) + self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) # model hook - model_ref.optimizer_step( + res = model_ref.optimizer_step( epoch=current_epoch, batch_idx=batch_idx, optimizer=optimizer, @@ -118,6 +118,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.post_optimizer_step() + return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref = self.model_ref @@ -134,7 +135,7 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - model = self.model_ref + model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX if self.trainer.amp_backend == AMPType.APEX: @@ -198,6 +199,7 @@ def to_device(self, batch): return self.batch_to_device(batch, self.root_device) + class NewCPUAccelerator(NewAccelerator): def setup(self, trainer, model): if isinstance(self.precision_plugin, MixedPrecisionPlugin): From 4c8d24fb27689b3894d3521bda140675fd12a697 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:36 +0100 Subject: [PATCH 022/157] add properties for accelerator forwarding --- pytorch_lightning/trainer/trainer.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 94c698cfb8501..e114db42956ae 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -445,6 +445,30 @@ def __init__( # Callback system self.on_init_end() + @property + def optimizers(self): + return self.accelerator_backend.optimizers + + @optimizers.setter + def optimizers(self, new_optims): + self.accelerator_backend.optimizers = new_optims + + @property + def lr_schedulers(self): + return self.accelerator_backend.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers): + self.accelerator_backend.lr_schedulers = new_schedulers + + @property + def optimizer_frequencies(self): + return self.accelerator_backend.optimizer_frequencies + + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs): + self.accelerator_backend.optimizer_frequencies = new_freqs + def fit( self, model: LightningModule, From 6faebfa4f5bad37d52ec66c20ab35e8ae83bf6b7 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:55 +0100 Subject: [PATCH 023/157] correct optimizer_step calls --- pytorch_lightning/trainer/training_loop.py | 27 ++++------------------ 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 25540791209ff..5dcf17f99f7a7 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -484,28 +484,11 @@ def _process_result(self, training_step_output, split_batch): return training_step_output_for_epoch_end def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure): - model_ref = self.trainer.get_model() - - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) 
- using_native_amp = self.trainer.amp_backend == AMPType.NATIVE - - # native amp + lbfgs is a no go right now - if using_native_amp and is_lbfgs: - raise MisconfigurationException( - 'native PyTorch amp and lbfgs are not compatible.' - ' To request, please file a Github issue in PyTorch and tag @mcarilli') - - # model hook - model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, - optimizer, - opt_idx, - train_step_and_backward_closure, - on_tpu=self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE, - using_native_amp=using_native_amp, - using_lbfgs=is_lbfgs, - ) + with self.trainer.profiler.profile("optimizer_step"): + # optimizer step lightningModule hook + self.trainer.accelerator_backend.optimizer_step( + optimizer, self.trainer.current_epoch, batch_idx, opt_idx, train_step_and_backward_closure + ) def on_before_zero_grad(self, optimizer): self.trainer.call_hook('on_before_zero_grad', optimizer) From 29568e1d6200089dd9cccbf2592df80f94e7832b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 16:24:55 +0100 Subject: [PATCH 024/157] call train or test --- pytorch_lightning/trainer/trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e114db42956ae..6e91dddb32b12 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -521,8 +521,8 @@ def fit( # self.accelerator_backend.train_loop = self.train # self.accelerator_backend.validation_loop = self.run_evaluation # self.accelerator_backend.test_loop = self.run_evaluation + self.train_loop.setup_training(model) - self.train() # ---------------------------- # TRAIN @@ -530,7 +530,11 @@ def fit( # hook self.call_hook('on_fit_start') - results = self.accelerator_backend.train() + if self.testing: + results = self.run_test() + else: + results = self.train() + self.accelerator_backend.teardown() # ---------------------------- From 33561d779950747e7dd007bca4e92d13c7f26a59 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 17:01:40 +0100 Subject: [PATCH 025/157] make calls to trainstep (ad fix bugs) --- pytorch_lightning/accelerators/accelerator.py | 10 ++++-- pytorch_lightning/accelerators/precision.py | 31 ++++++++++++------- pytorch_lightning/trainer/training_loop.py | 3 ++ 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index e2f044fab612f..7726f143093d5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -94,7 +94,7 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
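# Hedged sketch (stand-in signatures, not the real hook API) of the optimizer-step flow the
# loop now delegates: the training loop hands a closure to the accelerator, which brackets the
# LightningModule's `optimizer_step` hook with the precision plugin's pre/post hooks instead
# of stepping the optimizer itself.
def accelerator_optimizer_step(precision_plugin, lightning_module, optimizer,
                               current_epoch, batch_idx, opt_idx, lambda_closure):
    precision_plugin.pre_optimizer_step(optimizer, opt_idx)
    result = lightning_module.optimizer_step(     # model hook decides how/when to step
        epoch=current_epoch,
        batch_idx=batch_idx,
        optimizer=optimizer,
        optimizer_idx=opt_idx,
        optimizer_closure=lambda_closure,
    )
    precision_plugin.post_optimizer_step(optimizer, opt_idx)
    return result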
@@ -117,11 +117,11 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl using_lbfgs=is_lbfgs, ) - self.precision_plugin.post_optimizer_step() + self.precision_plugin.post_optimizer_step(optimizer, opt_idx) return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): - model_ref = self.model_ref + model_ref = self.lightning_module model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): @@ -129,6 +129,10 @@ def clip_gradients(self, optimizer, clip_val=None): grad_clip_val = self.gradient_clip_val if clip_val is not None: grad_clip_val = clip_val + + if grad_clip_val is None: + return + grad_clip_val = float(grad_clip_val) if grad_clip_val <= 0: diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index ca41e8242f104..d0db65fa12dbb 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -33,6 +33,21 @@ def master_params(self, optimizer): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers + def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) + automatic_optimization = model.automatic_optimization + + # do backward pass + if automatic_optimization: + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + return closure_loss + class MixedPrecisionPlugin(PrecisionPlugin): EPSILON = 1e-5 @@ -55,21 +70,13 @@ def pre_optimizer_step(self, optimizer, optimizer_idx): def post_optimizer_step(self, optimizer, optimizer_idx): self.scaler.update() - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = self.scaler.scale(closure_loss) # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = self.trainer.train_loop.automatic_optimization + automatic_optimization = model.automatic_optimization - # do backward pass - if automatic_optimization: - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) 
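The TODO above asks where `should_accumulate` should come from; a later patch in this series passes it into `backward` explicitly. A minimal sketch of the native-AMP backward contract shown in this hunk (scale the loss, run backward, detach the graph, and only unscale once the optimizer is actually about to step), assuming a CUDA-enabled PyTorch build so the scaler is active; the class name is illustrative and `model` is kept only to mirror the signature in the diff:

import torch


class _SketchNativeAMPBackward:
    def __init__(self):
        self.scaler = torch.cuda.amp.GradScaler()

    def backward(self, model, closure_loss, optimizer, opt_idx, should_accumulate):
        closure_loss = self.scaler.scale(closure_loss)
        closure_loss.backward()
        # release the graph once the backward pass has run
        closure_loss = closure_loss.detach()
        if not should_accumulate:
            # expose real (unscaled) gradients to hooks such as on_after_backward
            # and to gradient clipping before the optimizer step
            self.scaler.unscale_(optimizer)
        return closure_loss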
@@ -101,7 +108,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.__enter__() # do backward pass - if self.trainer.train_loop.automatic_optimization: + if self.lightning_module: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5dcf17f99f7a7..231d303c2942f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -493,6 +493,9 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ def on_before_zero_grad(self, optimizer): self.trainer.call_hook('on_before_zero_grad', optimizer) + def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): + self.trainer.accelerator_backend.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + def track_and_norm_grad(self, optimizer): # track gradient norms grad_norm_dic = self._track_gradient_norm() From ef947554c29a83dffed0039e27683fbc42ba8dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:26:58 +0100 Subject: [PATCH 026/157] remove gradient_clip_val from accelerator --- pytorch_lightning/accelerators/accelerator.py | 11 ++++------- .../accelerators/accelerator_connector.py | 1 - pytorch_lightning/trainer/training_loop.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 7726f143093d5..3d6f4ef92cea7 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -20,11 +20,9 @@ def __init__( self, precision_plugin: PrecisionPlugin, training_type_plugin: TrainingTypePlugin, - gradient_clip_val, ): self.precision_plugin = precision_plugin self.training_type_plugin = training_type_plugin - self.gradient_clip_val = gradient_clip_val self.optimizers = None self.lr_schedulers = None @@ -124,12 +122,11 @@ def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref = self.lightning_module model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) - def clip_gradients(self, optimizer, clip_val=None): - # use the trainer's clip val if none passed - grad_clip_val = self.gradient_clip_val - if clip_val is not None: - grad_clip_val = clip_val + def clip_gradients(self, optimizer, clip_val): + # TODO: separate TPU case from here + self._clip_gradients(optimizer, clip_val) + def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val is None: return diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b7486d60a47b0..2412da6e0d773 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -166,7 +166,6 @@ def select_accelerator(self, accelerator: Union[str, NewAccelerator]): return acc_cls( precision_plugin=self.select_precision_plugin(), training_type_plugin=self.select_training_type_plugin(), - gradient_clip_val=None, ) def set_distributed_mode(self): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 231d303c2942f..0087f5d36f52c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -501,7 +501,7 @@ def track_and_norm_grad(self, optimizer): 
grad_norm_dic = self._track_gradient_norm() # clip gradients - self.trainer.accelerator_backend.clip_gradients(optimizer) + self.trainer.accelerator_backend.clip_gradients(optimizer, self.trainer.gradient_clip_val) self._cur_grad_norm_dict = grad_norm_dic def _track_gradient_norm(self): From c5e989283d251a7dcd76d32fa7c3d6f8bb0c845c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:44:35 +0100 Subject: [PATCH 027/157] add back the step end methods --- pytorch_lightning/accelerators/accelerator.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3d6f4ef92cea7..59d011d4de163 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -88,6 +88,15 @@ def test_step(self, args): with self.training_type_plugin.test_step_context(): return self.lightning_module.test_step(*args) + def training_step_end(self, output): + return output + + def test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + def process_dataloader(self, dataloader): return dataloader From c02baadc2323f13fa51057aef0a7f96edaa6818f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:45:57 +0100 Subject: [PATCH 028/157] add precision todo comment --- pytorch_lightning/accelerators/accelerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 59d011d4de163..bfd4ba5ad86ac 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -148,6 +148,7 @@ def _clip_gradients(self, optimizer, grad_clip_val): model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + # ... 
or we call master_params() and in the default plugin we return the model.parameters() if self.trainer.amp_backend == AMPType.APEX: parameters = self.precision_plugin.master_params(optimizer) else: From ce4eafa532bffd2488eb941cb04f47e65ffb7170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:16:45 +0100 Subject: [PATCH 029/157] ddp --- pl_examples/bug_report_model.py | 23 +++------- pytorch_lightning/accelerators/accelerator.py | 4 +- .../accelerators/accelerator_connector.py | 32 ++++++++------ .../accelerators/data_parallel.py | 42 +++++++++++++------ .../trainer/connectors/env_vars_connector.py | 5 +++ pytorch_lightning/trainer/properties.py | 19 ++++++++- pytorch_lightning/trainer/trainer.py | 6 ++- pytorch_lightning/trainer/training_loop.py | 7 +--- pytorch_lightning/utilities/device_parser.py | 8 ++-- 9 files changed, 89 insertions(+), 57 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 1351048711df4..f480847938e6f 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -36,10 +36,8 @@ class RandomDataset(Dataset): def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): return self.data[index] - def __len__(self): return self.len @@ -55,63 +53,52 @@ class BoringModel(LightningModule): def __init__(self): """ Testing PL Module - Use as follows: - subclass - modify the behavior for what you want - class TestModel(BaseTestModel): def training_step(...): # do your own thing - or: - model = BaseTestModel() model.training_epoch_end = None - """ super().__init__() self.layer = torch.nn.Linear(32, 2) + @property + def automatic_optimization(self): + return True + def forward(self, x): return self.layer(x) - def loss(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - def step(self, x): - x = self.layer(x) + x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out - def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"loss": loss} - def training_step_end(self, training_step_outputs): return training_step_outputs - def training_epoch_end(self, outputs) -> None: torch.stack([x["loss"] for x in outputs]).mean() - def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"x": loss} - def validation_epoch_end(self, outputs) -> None: torch.stack([x['x'] for x in outputs]).mean() - def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"y": loss} - def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() - def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index bfd4ba5ad86ac..82f822c16a918 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -29,7 +29,9 @@ def __init__( self.optimizer_frequencies = None def setup(self, trainer, model): + print(trainer.global_rank, "Accelerator.setup") self.connect_training_type_plugin(self.training_type_plugin, model) + 
self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) @@ -53,7 +55,7 @@ def teardown(self): pass def batch_to_device(self, batch: Any, device: torch.device): - model = self.model + model = self.lightning_module if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 2412da6e0d773..a9327c87138ed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities import rank_zero_only @@ -48,7 +48,6 @@ def __init__( self, num_processes, tpu_cores, - accelerator, distributed_backend, auto_select_gpus, gpus, @@ -89,7 +88,6 @@ def __init__( # for gpus allow int, string and gpu list # if auto_select_gpus and isinstance(gpus, int): # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) - self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) # self.root_device = torch.device("cpu") @@ -97,7 +95,7 @@ def __init__( self.set_distributed_mode() # todo: select accelerator based on trainer flags - self.accelerator = self.select_accelerator(accelerator) + self.accelerator = self.select_accelerator() # override dist backend when using tpus if self.on_tpu: @@ -148,15 +146,23 @@ def select_precision_plugin(self): return PrecisionPlugin() def select_training_type_plugin(self): - return SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) - - def select_accelerator(self, accelerator: Union[str, NewAccelerator]): - - # return NewCPUAccelerator( - # precision_plugin=PrecisionPlugin(), - # training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), - # gradient_clip_val=None - # ) + if self.distributed_backend == "ddp": + plugin = DDPPlugin( + parallel_device_ids=self.parallel_devices, + num_nodes=self.num_nodes, + logger=None, + cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
+ is_slurm_managing_tasks=False, # TODO: determine this + ) + else: + # TODO: cover all other cases + plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) + return plugin + + def select_accelerator(self): + if isinstance(self.distributed_backend, NewAccelerator): + # custom accelerator from user + return self.distributed_backend if self.on_gpu: acc_cls = NewGPUAccelerator diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 9d0b47c1ee345..0e63bc2b91f03 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -53,7 +53,7 @@ def on_gpu(self): @property @abstractmethod - def root_device(self): + def root_device(self) -> torch.device: raise NotImplementedError @abstractmethod @@ -203,7 +203,7 @@ def reduce(self, output): @property def root_device(self): - return self.parallel_device_ids[0] + return torch.device("cuda", self.parallel_device_ids[0]) @property def lightning_module(self): @@ -220,15 +220,28 @@ class DDPPlugin(ParallelPlugin): distributed_backend = "ddp" - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) -> None: + def __init__( + self, + parallel_device_ids, + num_nodes=1, + logger=None, + cluster_environment=None, + is_slurm_managing_tasks=False, + **kwargs: Dict[str, Any], + ) -> None: super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) - self._has_spawned_children = False self.interactive_ddp_procs = [] self.dist = LightningDistributed() + self.num_nodes = num_nodes + self.is_slurm_managing_tasks = is_slurm_managing_tasks + self._ddp_kwargs = kwargs + self._has_spawned_children = False + self.task_idx = None + self.num_processes = len(parallel_device_ids) @property def root_device(self): - return self.parallel_device_ids[self.local_rank] + return torch.device("cuda", self.parallel_device_ids[self.local_rank]) def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -243,6 +256,7 @@ def determine_node_rank(self): return super().determine_node_rank() def setup(self, model): + print("DDPPlugin.setup") self._model = model @@ -302,7 +316,7 @@ def _call_children_scripts(self): if self.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.logger.version) - num_gpus = len(self.data_parallel_device_ids) + num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" @@ -354,7 +368,7 @@ def configure_ddp(self): ) def determine_ddp_device_ids(self): - return [self.root_device] + return [self.root_device.index] def init_ddp_connection(self, global_rank: int, world_size: int) -> None: # TODO: From where to get cluster environment? @@ -390,7 +404,7 @@ def pre_training(self): # try to init for 20 times at max in case ports are taken # where to store ip_table # TODO: CHeck is_slurm_managing_tasks - self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + self.init_ddp_connection(self.global_rank, self.world_size) # TODO: Move this somewhere else # self.trainer.call_setup_hook(self.model) @@ -402,6 +416,11 @@ def pre_training(self): log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") log.info("-" * 100) + # TODO: I moved this from training loop to here, is it the right place? 
+ # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device @@ -450,14 +469,11 @@ def model_to_device(self): # TODO: Can we easily make this a property that falls back here? # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] torch.cuda.set_device(self.root_device) - self.model.cuda(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None): + self.model.to(self.root_device) + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) - return output diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index e4d5670b5fe78..29a6dd137c021 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -28,6 +28,9 @@ def overwrite_by_env_vars(fn: Callable) -> Callable: def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ + + print("before", kwargs["gpus"]) + if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] @@ -37,6 +40,8 @@ def overwrite_by_env_vars(self, *args, **kwargs): # todo: maybe add a warning that some init args were overwritten by Env arguments kwargs.update(vars(parse_env_variables(cls))) + print("after", kwargs["gpus"]) + # all args were already moved to kwargs return fn(self, **kwargs) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 6dc6802bc9021..cb613dc087691 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -76,6 +76,11 @@ def accelerator_backend(self): # for backward compatibility return self.accelerator + @property + def distributed_backend(self): + # for backward compatibility + return self.accelerator_connector.distributed_backend + @property def training_type_plugin(self): return self.accelerator.training_type_plugin @@ -128,6 +133,14 @@ def use_tpu(self): return False # return self.accelerator_connector.use_tpu + @property + def num_nodes(self): + return self.accelerator_connector.num_gpus + + @property + def num_processes(self): + return self.accelerator_connector.num_processes + @property def log_dir(self): if self.checkpoint_callback is not None: @@ -261,7 +274,7 @@ def disable_validation(self) -> bool: @property def enable_validation(self) -> bool: """ Check if we should run validation during training. 
""" - model_ref = self.model_connector.get_model() + model_ref = self.get_model() val_loop_enabled = is_overridden('validation_step', model_ref) and self.limit_val_batches > 0 return val_loop_enabled @@ -323,7 +336,9 @@ def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) def get_model(self): - return self.model_connector.get_model() + # TODO: rename this to lightning_module (see training type plugin) + # backward compatible + return self.training_type_plugin.lightning_module def __getstate__(self): # unwrap optimizer diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6e91dddb32b12..b71d9ced7e0e6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -311,6 +311,10 @@ def __init__( self._distrib_type = None self._running_stage = None + distributed_backend = distributed_backend or accelerator + + print("gpus passed into trainer", gpus) + # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) @@ -319,7 +323,6 @@ def __init__( self.accelerator_connector = BackendConnector( num_processes, tpu_cores, - accelerator, distributed_backend, auto_select_gpus, gpus, @@ -513,6 +516,7 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + self.training_type_plugin.pre_training() # ---------------------------- # INSPECT THESE FOR MAIN LOOPS diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0087f5d36f52c..28bbb5a4f722c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -130,13 +130,10 @@ def setup_training(self, model: LightningModule): # -------------------------- # Setup?? # -------------------------- + # ref_model = self.trainer.get_model() + print(self.trainer.global_rank, type(model)) ref_model = self.trainer.get_model() - # set the ranks and devices - # TODO dist was a AttributeDict, should be moved to plugin? - # self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - # self.trainer.accelerator_backend.dist.device = ref_model.device - # give model convenience properties ref_model.trainer = self.trainer diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index b1bd62277aa18..9417bc13e8e8b 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, MutableSequence, Optional, Union +from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch @@ -146,9 +146,9 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: return gpus -def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int]]) -> Optional[List[int]]: +def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, ...]]) -> Optional[List[int]]: assert gpus is not None - if isinstance(gpus, MutableSequence): + if isinstance(gpus, (list, tuple)): return list(gpus) # must be an int @@ -177,7 +177,7 @@ def _check_data_type(device_ids: Any) -> None: device_ids: gpus/tpu_cores parameter as passed to the Trainer """ if device_ids is not None and \ - (not isinstance(device_ids, (int, str, MutableSequence)) or isinstance(device_ids, bool)): + (not isinstance(device_ids, (int, str, list, tuple)) or isinstance(device_ids, bool)): raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") From e6ba00982c3a784851227fee0ee872a92fa4bcb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:21:06 +0100 Subject: [PATCH 030/157] clean up --- pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 2 -- pytorch_lightning/trainer/connectors/env_vars_connector.py | 4 ---- pytorch_lightning/trainer/trainer.py | 2 -- pytorch_lightning/trainer/training_loop.py | 4 +--- 5 files changed, 1 insertion(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 82f822c16a918..6bc7cdeca612b 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -29,7 +29,6 @@ def __init__( self.optimizer_frequencies = None def setup(self, trainer, model): - print(trainer.global_rank, "Accelerator.setup") self.connect_training_type_plugin(self.training_type_plugin, model) self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 0e63bc2b91f03..801015afaff79 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -256,8 +256,6 @@ def determine_node_rank(self): return super().determine_node_rank() def setup(self, model): - print("DDPPlugin.setup") - self._model = model # start the other scripts diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index 29a6dd137c021..6b907d288c5ca 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -29,8 +29,6 @@ def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ - print("before", kwargs["gpus"]) - if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] @@ -40,8 +38,6 @@ def overwrite_by_env_vars(self, *args, **kwargs): # todo: maybe add a warning that some init args were overwritten by Env arguments kwargs.update(vars(parse_env_variables(cls))) - print("after", kwargs["gpus"]) - # all args were already moved to kwargs return fn(self, **kwargs) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b71d9ced7e0e6..6582fa6421c80 
100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -313,8 +313,6 @@ def __init__( distributed_backend = distributed_backend or accelerator - print("gpus passed into trainer", gpus) - # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 28bbb5a4f722c..e8aefb53ad699 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -130,9 +130,7 @@ def setup_training(self, model: LightningModule): # -------------------------- # Setup?? # -------------------------- - # ref_model = self.trainer.get_model() - print(self.trainer.global_rank, type(model)) - ref_model = self.trainer.get_model() + ref_model = model # give model convenience properties ref_model.trainer = self.trainer From fa4d84432ea4a8857f54481fef6b3245c5108fcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:33:16 +0100 Subject: [PATCH 031/157] connect --- pl_examples/bug_report_model.py | 12 ++++++++++++ pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/data_parallel.py | 2 -- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index f480847938e6f..03ccd47e09d97 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -36,8 +36,10 @@ class RandomDataset(Dataset): def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) + def __getitem__(self, index): return self.data[index] + def __len__(self): return self.len @@ -72,33 +74,43 @@ def automatic_optimization(self): def forward(self, x): return self.layer(x) + def loss(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + def step(self, x): x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out + def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"loss": loss} + def training_step_end(self, training_step_outputs): return training_step_outputs + def training_epoch_end(self, outputs) -> None: torch.stack([x["loss"] for x in outputs]).mean() + def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"x": loss} + def validation_epoch_end(self, outputs) -> None: torch.stack([x['x'] for x in outputs]).mean() + def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"y": loss} + def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() + def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 6bc7cdeca612b..8f38c70d69cc0 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,7 +30,7 @@ def __init__( def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) - self.training_type_plugin.setup(model) + # self.training_type_plugin.setup(model) 
self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 801015afaff79..586597656bb30 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -139,7 +139,6 @@ def model_to_device(self): def connect(self, model: torch.nn.Module): self._model = model self.model_to_device() - return self.model @property @@ -180,7 +179,6 @@ def setup(self, model): def connect(self, model): self.setup(model) - return self.model @property From 8be82a43ebf3dbb194ca017c44f5ae19bb73895a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:36:34 +0100 Subject: [PATCH 032/157] clean up --- pytorch_lightning/accelerators/accelerator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8f38c70d69cc0..a1eb3f4db1d12 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,7 +30,6 @@ def __init__( def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) - # self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) From 08ce7d323a143d1ffb1f46c15e81f477840658f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:40:03 +0100 Subject: [PATCH 033/157] post --- pytorch_lightning/trainer/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6582fa6421c80..8ca7bedd76ccb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -514,6 +514,8 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + + # TODO: is calling pre-training the correct place here @justus? self.training_type_plugin.pre_training() # ---------------------------- @@ -537,6 +539,8 @@ def fit( else: results = self.train() + # TODO: is calling post training the correct place here @justus? + self.training_type_plugin.post_training(results, self.checkpoint_callback.best_model_path) self.accelerator_backend.teardown() # ---------------------------- From ffbcd4fa80d34f792f5905093f764a5ab4bf7649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 02:45:49 +0100 Subject: [PATCH 034/157] disable progress bar on rank > 0 --- pytorch_lightning/accelerators/data_parallel.py | 13 ------------- pytorch_lightning/trainer/training_loop.py | 3 +++ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 586597656bb30..3946109fc2a13 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -382,20 +382,12 @@ def pre_training(self): if seed is not None: seed_everything(int(seed)) - # show progressbar only on progress_rank 0 - # TODO: check where to move this. 
Cannot stay here, since we won't have access to progressbar here - # if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() - # determine which process we are and world size self.set_world_ranks() # set warning rank rank_zero_only.rank = self.global_rank - # TODO: This has to be done somewhere else! - # self.model.trainer = self.trainer - # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table @@ -412,7 +404,6 @@ def pre_training(self): log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") log.info("-" * 100) - # TODO: I moved this from training loop to here, is it the right place? # set the ranks and devices self.dist.rank = self.global_rank self.dist.device = self.root_device @@ -422,10 +413,6 @@ def pre_training(self): # move the model to the correct device self.model_to_device() - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - self.configure_ddp() self.barrier() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e8aefb53ad699..8a69046752088 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -157,6 +157,9 @@ def setup_training(self, model: LightningModule): # register auto-resubmit when on SLURM self.trainer.slurm_connector.register_slurm_signal_handlers() + if not self.trainer.is_global_zero and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + # -------------------------- # Pre-train # -------------------------- From 4be76bf7a480bdb1b15fe15f20cf42397c501b1c Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 10 Dec 2020 08:45:34 +0100 Subject: [PATCH 035/157] precision test --- pytorch_lightning/accelerators/accelerator.py | 4 +- pytorch_lightning/accelerators/precision.py | 40 +++++++++++++++---- pytorch_lightning/trainer/training_loop.py | 6 ++- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index a1eb3f4db1d12..567badcd70c32 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -100,8 +100,8 @@ def validation_step_end(self, output): def process_dataloader(self, dataloader): return dataloader - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, *args, **kwargs) + def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): + return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
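With `should_accumulate` now a parameter of `Accelerator.backward`, the value has to originate in the training loop, which is the only place that knows the accumulation schedule; the training_loop.py hunk further below does exactly that. A rough sketch of the flow — the accumulation rule here is simplified (the real loop also handles the last batches of an epoch), and `_SketchTrainLoop` is not a real class:

class _SketchTrainLoop:
    def __init__(self, accelerator, accumulate_grad_batches=2):
        self.accelerator = accelerator
        self.accumulate_grad_batches = accumulate_grad_batches
        self.batch_idx = 0

    def should_accumulate(self):
        # keep accumulating until the configured number of batches has been seen
        return (self.batch_idx + 1) % self.accumulate_grad_batches != 0

    def backward(self, loss, optimizer, opt_idx):
        # hand the decision down; the accelerator forwards it to the precision plugin
        return self.accelerator.backward(loss, optimizer, opt_idx, self.should_accumulate())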
diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index d0db65fa12dbb..9733aadf96a33 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -33,7 +33,16 @@ def master_params(self, optimizer): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers - def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) automatic_optimization = model.automatic_optimization @@ -70,7 +79,16 @@ def pre_optimizer_step(self, optimizer, optimizer_idx): def post_optimizer_step(self, optimizer, optimizer_idx): self.scaler.update() - def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): closure_loss = self.scaler.scale(closure_loss) # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) @@ -79,8 +97,7 @@ def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *ar closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` - # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) 
- if not self.trainer.train_loop.should_accumulate() and automatic_optimization: + if not should_accumulate and automatic_optimization: self.scaler.unscale_(optimizer) return closure_loss @@ -100,7 +117,16 @@ def connect(self, model, optimizers, lr_schedulers): reinit_scheduler_properties(optimizers, lr_schedulers) return model, optimizers, lr_schedulers - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): closure_loss = amp.scale_loss(closure_loss, optimizer) # enter apex context @@ -108,8 +134,8 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.__enter__() # do backward pass - if self.lightning_module: - model = self.trainer.get_model() + # TODO: not entirely sure, why we need this + if model is not None and isinstance(model, LightningModule): model.backward(closure_loss, optimizer, opt_idx) else: closure_loss.backward(*args, **kwargs) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 8a69046752088..3b9e704f840b8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -818,12 +818,14 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, def backward(self, result, optimizer, opt_idx, *args, **kwargs): self.trainer.dev_debugger.track_event("backward_call") + should_accumulate = self.should_accumulate() + # backward can be called manually in the training loop if isinstance(result, torch.Tensor): - self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, *args, **kwargs) + self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs) else: result.closure_loss = self.trainer.accelerator_backend.backward( - result.closure_loss, optimizer, opt_idx, *args, **kwargs + result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) if not self.should_accumulate(): From 098f6650fd43dfa0ebb244c5671d62bce1ee75c2 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Thu, 10 Dec 2020 10:01:30 +0100 Subject: [PATCH 036/157] fix native amp --- pytorch_lightning/accelerators/accelerator.py | 39 ++++++-- .../accelerators/accelerator_connector.py | 40 ++++++++- pytorch_lightning/accelerators/precision.py | 2 +- pytorch_lightning/trainer/trainer.py | 90 ++++++++++++------- 4 files changed, 127 insertions(+), 44 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 567badcd70c32..722328dd66325 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,7 +1,7 @@ from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities import AMPType, NATIVE_AMP_AVALAIBLE from typing import Any, Union import math @@ -9,13 +9,17 @@ from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.accelerators.precision import ( + ApexMixedPrecisionPlugin, + 
MixedPrecisionPlugin, + NativeMixedPrecisionPlugin, + PrecisionPlugin, +) from pytorch_lightning.utilities.apply_func import move_data_to_device class NewAccelerator(object): - def __init__( self, precision_plugin: PrecisionPlugin, @@ -101,14 +105,18 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): - return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) + return self.precision_plugin.backward( + self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs + ) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE + native_amp = ( + isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE + ) self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) @@ -138,7 +146,7 @@ def clip_gradients(self, optimizer, clip_val): def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val is None: return - + grad_clip_val = float(grad_clip_val) if grad_clip_val <= 0: @@ -209,6 +217,25 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin): def to_device(self, batch): return self.batch_to_device(batch, self.root_device) + @property + def amp_backend(self): + if isinstance(self.precision_plugin, ApexMixedPrecisionPlugin): + return AMPType.APEX + elif isinstance(self.precision_plugin, NativeMixedPrecisionPlugin): + return AMPType.NATIVE + else: + return None + + @property + def precision(self): + return self.precision_plugin.precision + + @property + def scaler(self): + if hasattr(self.precision_plugin, 'scaler'): + return self.precision_plugin.scaler + + return None class NewCPUAccelerator(NewAccelerator): diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index a9327c87138ed..0dd945a4a0fa5 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,8 +19,8 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin -from pytorch_lightning.accelerators.precision import PrecisionPlugin -from pytorch_lightning.utilities import device_parser +from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -57,6 +57,9 @@ def __init__( benchmark, replace_sampler_ddp, deterministic, + precision, + amp_type, + amp_level ): # initialization @@ -77,6 +80,9 @@ def __init__( self.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp self.deterministic = deterministic + self.precision = precision + self.amp_type = None if amp_type is None else 
amp_type.lower() + self.amp_level = amp_level # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -143,7 +149,35 @@ def num_gpus(self) -> int: return len(gpus) def select_precision_plugin(self): - return PrecisionPlugin() + if self.precision == 32: + self.amp_type = None + return PrecisionPlugin() + + elif self.precision == 16: + if self.amp_type == 'native': + if not NATIVE_AMP_AVALAIBLE: + rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' + ' Consider upgrading with `pip install torch>=1.6`.' + ' We will attempt to use NVIDIA Apex for this session.') + self.amp_type = 'apex' + else: + log.info('Using native 16bit precision.') + self.amp_type = AMPType.NATIVE + return NativeMixedPrecisionPlugin() + + if self.amp_type =='apex': + if not APEX_AVAILABLE: + rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' + ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + else: + log.info('Using APEX 16bit precision.') + self.amp_type = AMPType.APEX + return ApexMixedPrecisionPlugin(self.amp_level) + + + + else: + raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): if self.distributed_backend == "ddp": diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 9733aadf96a33..3ce68c8e1efc6 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -94,7 +94,7 @@ def backward( # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) automatic_optimization = model.automatic_optimization - closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` if not should_accumulate and automatic_optimization: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8ca7bedd76ccb..bf07c17727d59 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,6 +15,7 @@ """Trainer to automate the training.""" import os +from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -56,6 +57,7 @@ from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.trainer.states import RunningStage, TrainerState +from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector @@ -80,8 +82,9 @@ # warnings to ignore in trainer warnings.filterwarnings( - 'ignore', message='torch.distributed.reduce_op is deprecated, ' 'please use torch.distributed.ReduceOp instead' + "ignore", message="torch.distributed.reduce_op is deprecated, " "please use torch.distributed.ReduceOp instead" ) +os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" class Trainer( @@ -128,7 +131,7 @@ def 
__init__( accelerator: Optional[Union[str, NewAccelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, - weights_summary: Optional[str] = 'top', + weights_summary: Optional[str] = "top", weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, @@ -330,10 +333,13 @@ def __init__( benchmark, replace_sampler_ddp, deterministic, + precision, + amp_backend, + amp_level, ) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) - self.precision_connector = PrecisionConnector(self) + # self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) @@ -438,7 +444,7 @@ def __init__( ) # set precision - self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) + # self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # last thing are the plugins which override whatever the trainer used by default self.plugin_connector.on_trainer_init(plugins) @@ -470,6 +476,18 @@ def optimizer_frequencies(self): def optimizer_frequencies(self, new_freqs): self.accelerator_backend.optimizer_frequencies = new_freqs + @property + def amp_backend(self): + return self.accelerator_backend.amp_backend + + @property + def precision(self): + return self.accelerator_backend.precision + + @property + def scaler(self): + return self.accelerator_backend.scaler + def fit( self, model: LightningModule, @@ -506,7 +524,7 @@ def fit( # bookkeeping # we reuse fit in .test() but change its behavior using this flag - self.testing = os.environ.get('PL_TESTING_MODE', self.testing) + self.testing = os.environ.get("PL_TESTING_MODE", self.testing) # ---------------------------- # SET UP TRAINING @@ -532,7 +550,7 @@ def fit( # TRAIN # ---------------------------- # hook - self.call_hook('on_fit_start') + self.call_hook("on_fit_start") if self.testing: results = self.run_test() @@ -547,12 +565,12 @@ def fit( # POST-Training CLEAN UP # ---------------------------- # hook - self.call_hook('on_fit_end') + self.call_hook("on_fit_end") # hook - self.teardown('fit') - if self.is_function_implemented('teardown'): - model.teardown('fit') + self.teardown("fit") + if self.is_function_implemented("teardown"): + model.teardown("fit") # return 1 when finished # used for testing or when we need to know that training succeeded @@ -597,7 +615,7 @@ def train(self): return # update LR schedulers - self.optimizer_connector.update_learning_rates(interval='epoch') + self.optimizer_connector.update_learning_rates(interval="epoch") # early stopping met_min_epochs = epoch >= self.min_epochs - 1 @@ -606,14 +624,18 @@ def train(self): if self.should_stop: if met_min_epochs and met_min_steps: return - log.info( - 'Trainer was signaled to stop but required minimum epochs' - f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has' - ' not been met. Training will continue...' - ) + else: + log.info( + "Trainer was signaled to stop but required minimum epochs" + f" ({self.min_epochs}) or minimum steps ({self.min_steps}) has" + " not been met. Training will continue..." + ) + + # hook + self.train_loop.on_train_end() except KeyboardInterrupt: - rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...') + rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...") # user could press ctrl+c many times... 
only shutdown once if not self.interrupted: @@ -744,7 +766,7 @@ def run_test(self): return eval_loop_results def run_sanity_check(self, ref_model): - using_val_step = ref_model.val_dataloader is not None and is_overridden('validation_step', ref_model) + using_val_step = ref_model.val_dataloader is not None and is_overridden("validation_step", ref_model) should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0 # run tiny validation (if validation defined) @@ -781,7 +803,7 @@ def test( self, model: Optional[LightningModule] = None, test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - ckpt_path: Optional[str] = 'best', + ckpt_path: Optional[str] = "best", verbose: bool = True, datamodule: Optional[LightningDataModule] = None, ): @@ -815,18 +837,18 @@ def test( # If you supply a datamodule you can't supply train_dataloader or val_dataloaders if test_dataloaders and datamodule: raise MisconfigurationException( - 'You cannot pass test_dataloaders to trainer.test if you supply a datamodule' + "You cannot pass test_dataloaders to trainer.test if you supply a datamodule" ) # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model or self.get_model(), datamodule, 'test') + self.data_connector.attach_datamodule(model or self.get_model(), datamodule, "test") if model is not None: results = self.__test_given_model(model, test_dataloaders) else: results = self.__test_using_best_weights(ckpt_path, test_dataloaders) - self.teardown('test') + self.teardown("test") return results @@ -834,7 +856,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model = self.get_model() # if user requests the best checkpoint but we don't have it, error - if ckpt_path == 'best' and not self.checkpoint_callback.best_model_path: + if ckpt_path == "best" and not self.checkpoint_callback.best_model_path: raise MisconfigurationException( 'ckpt_path is "best", but ModelCheckpoint is not configured to save the best model.' ) @@ -842,20 +864,20 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # load best weights if ckpt_path is not None: # ckpt_path is 'best' so load the best model - if ckpt_path == 'best': + if ckpt_path == "best": ckpt_path = self.checkpoint_callback.best_model_path if len(ckpt_path) == 0: rank_zero_warn( - f'.test() found no path for the best weights, {ckpt_path}. Please ' - f'specify a path for a checkpoint .test(ckpt_path=PATH)' + f".test() found no path for the best weights, {ckpt_path}. 
Please " + f"specify a path for a checkpoint .test(ckpt_path=PATH)" ) return {} if self.accelerator_backend is not None and not self._device_type == DeviceType.TPU: self.accelerator_backend.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt['state_dict']) + model.load_state_dict(ckpt["state_dict"]) # attach dataloaders if test_dataloaders is not None: @@ -864,16 +886,16 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path self.testing = True - os.environ['PL_TESTING_MODE'] = '1' + os.environ["PL_TESTING_MODE"] = "1" self.model = model results = self.fit(model) self.testing = False - del os.environ['PL_TESTING_MODE'] + del os.environ["PL_TESTING_MODE"] # teardown - if self.is_function_implemented('teardown'): + if self.is_function_implemented("teardown"): model_ref = self.get_model() - model_ref.teardown('test') + model_ref.teardown("test") return results @@ -891,8 +913,8 @@ def __test_given_model(self, model, test_dataloaders): self.testing = False # teardown - if self.is_function_implemented('teardown'): - model.teardown('test') + if self.is_function_implemented("teardown"): + model.teardown("test") return results @@ -922,7 +944,7 @@ def tune( def call_setup_hook(self, model): # call setup after the ddp process has connected - stage_name = 'test' if self.testing else 'fit' + stage_name = "test" if self.testing else "fit" if self.datamodule is not None: called = self.datamodule.has_setup_test if self.testing else self.datamodule.has_setup_fit if not called: From ea856333b6b1f38c8e275ab080f528da1dfac5bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 06:54:42 +0100 Subject: [PATCH 037/157] a From 846dc92ea535d2367c80a6eae2e1e28344fa32f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 08:03:29 +0100 Subject: [PATCH 038/157] ddp spawn --- .../accelerators/accelerator_connector.py | 10 +- .../accelerators/data_parallel.py | 382 ++++++++++-------- 2 files changed, 221 insertions(+), 171 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 0dd945a4a0fa5..6c23caede81a9 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -188,6 +188,14 @@ def select_training_type_plugin(self): cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) + elif self.use_ddp and self.distributed_backend == "ddp_spawn": + plugin = DDPSpawnPlugin( + parallel_device_ids=self.parallel_devices, + num_nodes=self.num_nodes, + logger=None, + cluster_environment=TorchElasticEnvironment(), + is_slurm_managing_tasks=False, # TODO: determine this + ) else: # TODO: cover all other cases plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 3946109fc2a13..8e2420da82c76 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -185,6 +185,23 @@ def connect(self, model): def is_global_zero(self) -> bool: return self.global_rank == 0 + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + class DataParallelPlugin(ParallelPlugin): def setup(self, model): @@ -423,24 +440,6 @@ def post_training(self, results, best_model_path): if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] - @staticmethod - def configure_sync_batchnorm(model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() @@ -460,168 +459,211 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output -# class DDPSpawnPlugin(ParallelPlugin): -# def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): -# super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) -# self.process_idx = None +class DDPSpawnPlugin(ParallelPlugin): + + distributed_backend = "ddp_spawn" -# self.dist = LightningDistributed() -# # TODO: how to get in nprocs? 
probably pass it -# self.num_processes = num_processes -# self.mp_queue = None -# self.proc_offset = proc_offset + def __init__( + self, + parallel_device_ids, + num_nodes=1, + logger=None, + cluster_environment=None, + is_slurm_managing_tasks=False, + proc_offset=0, + **kwargs: Dict[str, Any] + ): + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self.num_nodes = num_nodes + self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.proc_offset = proc_offset + self._ddp_kwargs = kwargs + self.process_idx = None + self.dist = LightningDistributed() + self.num_processes = len(parallel_device_ids) + self.mp_queue = None -# def setup(self, model): -# os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + @property + def root_device(self): + return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + + def setup(self, model): + self._model = model -# # pass in a state q -# smp = mp.get_context('spawn') -# self.mp_queue = smp.SimpleQueue() + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) -# def set_world_ranks(self): -# self.local_rank = self.process_idx -# # check from where we get node_rank, num_processes and num_nodes -# self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx -# self.world_size = self.num_nodes * self.num_processes + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() -# def pre_training(self): + def set_world_ranks(self): + self.local_rank = self.process_idx + # check from where we get node_rank, num_processes and num_nodes + self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx + self.world_size = self.num_nodes * self.num_processes -# # TODO: Check if current process can be used as one training proc -# # start from one since current process is proc 0 -# for proc_idx in range(1, self.num_processes): -# # use os.fork, since this enables us to continue from here -# # instead of spawning with separate function -# pid = os.fork() + def pre_training(self): -# # set in child processes (PID=0). All previous child processes -# # should already have their process_idx assigned -# if pid == 0 and self.process_idx is None: -# self.process_idx = proc_idx + self.proc_offset + # TODO: Check if current process can be used as one training proc + # start from one since current process is proc 0 + for proc_idx in range(1, self.num_processes): + # use os.fork, since this enables us to continue from here + # instead of spawning with separate function + pid = os.fork() -# # set process idx for current process -# if pid != 0: -# self.process_idx = 0 + self.proc_offset + # set in child processes (PID=0). 
All previous child processes + # should already have their process_idx assigned + if pid == 0 and self.process_idx is None: + self.process_idx = proc_idx + self.proc_offset -# # TODO: Check where to put that since we don't have access to the pbar here -# # show progressbar only on progress_rank 0 -# # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: -# # self.trainer.progress_bar_callback.disable() + # set process idx for current process + if pid != 0: + self.process_idx = 0 + self.proc_offset -# self.set_world_ranks() + # TODO: Check where to put that since we don't have access to the pbar here + # show progressbar only on progress_rank 0 + # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() -# # set warning rank -# rank_zero_only.rank = self.global_rank - -# # TODO: This has to be done somewhere else! -# # self.model.trainer = self.trainer - -# # set up server using proc 0's ip address -# # try to init for 20 times at max in case ports are taken -# # where to store ip_table -# # TODO: CHeck is_slurm_managing_tasks -# self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) - -# # TODO: Move this somewhere else -# # self.trainer.call_setup_hook(self.model) - -# # on world_size=0 let everyone know training is starting -# if self.is_global_zero and not torch.distributed.is_initialized(): -# log.info("-" * 100) -# log.info(f"distributed_backend={self.distributed_backend}") -# log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") -# log.info("-" * 100) - -# self.model = self.configure_sync_batchnorm(self.model) - -# # move the model to the correct device -# self.model_to_device() - -# # TODO: Check where this can be moved -# # set model properties before going into wrapper -# # self.trainer.model_connector.copy_trainer_model_properties(self.model) - -# self.configure_ddp() - -# self.barrier() - -# def post_training(self, results, best_model_path): -# # get original model -# # TODO: How To get this? is this simply self.model? 
-# # model = self.trainer.get_model() -# model = self.model - -# # persist info in ddp_spawn -# self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) - -# # clean up memory -# torch.cuda.empty_cache() - -# if self.process_idx == 0: -# # restore main state with best weights -# best_path = self.mp_queue.get() -# results = self.mp_queue.get() -# last_path = self.mp_queue.get() - -# # recover the weights of the processes trained in the children -# self.__recover_child_process_weights(model, best_path, last_path) - -# def configure_ddp(self): -# # if unset, default `find_unused_parameters` `True` -# self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) -# self.model = LightningDistributedDataParallel( -# self.model, -# device_ids=self.determine_ddp_device_ids(), -# **self._ddp_kwargs, -# ) - -# def determine_ddp_device_ids(self): -# return [self.root_device] - -# def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): - -# if self.global_rank == 0 and self.mp_queue is not None: -# rank_zero_warn('cleaning up ddp environment...') -# # todo, pass complete checkpoint as state dictionary -# self.mp_queue.put(best_model_path) -# self.mp_queue.put(results) - -# # save the last weights -# last_path = None -# # TODO: From where to get self.trainer.testing? -# # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: -# if best_model_path is not None and len(best_model_path) > 0: -# last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) -# atomic_save(self.model.state_dict(), last_path) -# self.mp_queue.put(last_path) - - -# def __recover_child_process_weights(self, model, best_path, last_path): -# # TODO: Where can we set this? -# # transfer back the best path to the trainer -# # if self.trainer.checkpoint_callback: -# # self.trainer.checkpoint_callback.best_model_path = best_path -# # todo, pass also best score - -# # load last weights -# # TODO: How to get self.trainer.testing? -# if last_path is not None: # and not self.trainer.testing: -# ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) -# model.load_state_dict(ckpt) - -# # TODO: Where to set this? -# # Do we really need to set this or can we just make the trainer property forward our current property here? -# # self.trainer.model = model - -# def determine_local_rank(self): -# if self.is_slurm_managing_tasks: -# return int(os.environ['SLURM_LOCALID']) -# else: -# return super().determine_node_rank() - -# def determine_node_rank(self): -# if self.is_slurm_managing_tasks: -# return int(os.environ['SLURM_NODEID']) -# else: -# return super().determine_node_rank() + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: Move this somewhere else + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + # TODO: Check where this can be moved + # set model properties before going into wrapper + # self.trainer.model_connector.copy_trainer_model_properties(self.model) + + self.configure_ddp() + + self.barrier() + + def post_training(self, results, best_model_path): + # get original model + # TODO: How To get this? is this simply self.model? + # model = self.trainer.get_model() + model = self.model + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + + # clean up memory + torch.cuda.empty_cache() + + if self.process_idx == 0: + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: this code is duplicated in DDP and DDPSpawn, make this a function + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def determine_ddp_device_ids(self): + return [self.root_device] + + def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(results) + + # save the last weights + last_path = None + # TODO: From where to get self.trainer.testing? + # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + if best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) + atomic_save(self.model.state_dict(), last_path) + self.mp_queue.put(last_path) + + + def __recover_child_process_weights(self, model, best_path, last_path): + # TODO: Where can we set this? + # transfer back the best path to the trainer + # if self.trainer.checkpoint_callback: + # self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + # TODO: How to get self.trainer.testing? + if last_path is not None: # and not self.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + # TODO: Where to set this? + # Do we really need to set this or can we just make the trainer property forward our current property here? 
+ # self.trainer.model = model + + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 0d0c3d718975f0fa4ae49bde0b2c5850ab06e28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 08:15:48 +0100 Subject: [PATCH 039/157] spawn --- .../accelerators/data_parallel.py | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8e2420da82c76..231da55fbfe10 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -478,7 +478,6 @@ def __init__( self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset self._ddp_kwargs = kwargs - self.process_idx = None self.dist = LightningDistributed() self.num_processes = len(parallel_device_ids) self.mp_queue = None @@ -496,36 +495,50 @@ def setup(self, model): smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def set_world_ranks(self): - self.local_rank = self.process_idx + def set_world_ranks(self, process_idx): + self.local_rank = process_idx # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx + self.global_rank = self.determine_node_rank() * self.num_processes + process_idx self.world_size = self.num_nodes * self.num_processes def pre_training(self): - - # TODO: Check if current process can be used as one training proc - # start from one since current process is proc 0 - for proc_idx in range(1, self.num_processes): - # use os.fork, since this enables us to continue from here - # instead of spawning with separate function - pid = os.fork() - - # set in child processes (PID=0). 
All previous child processes - # should already have their process_idx assigned - if pid == 0 and self.process_idx is None: - self.process_idx = proc_idx + self.proc_offset - - # set process idx for current process - if pid != 0: - self.process_idx = 0 + self.proc_offset + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, self.model, self.proc_offset,)) + + print(self.global_rank, "I am still running", os.getpid(), + "i will go into training loop and crash because i didn't enter process group") + + def new_process(self, process_idx, mp_queue, model, proc_offset): + print("i am a new process", os.getpid()) + # TODO: check if needed + # seed = os.environ.get("PL_GLOBAL_SEED") + # if seed is not None: + # seed_everything(int(seed)) + + # # TODO: Check if current process can be used as one training proc + # No because torch.multiprocessing does not support the fork method in combination with cuda + # # start from one since current process is proc 0 + # for proc_idx in range(1, self.num_processes): + # # use os.fork, since this enables us to continue from here + # # instead of spawning with separate function + # pid = os.fork() + # + # # set in child processes (PID=0). All previous child processes + # # should already have their process_idx assigned + # if pid == 0 and self.process_idx is None: + # self.process_idx = proc_idx + self.proc_offset + # + # # set process idx for current process + # if pid != 0: + # self.process_idx = 0 + self.proc_offset # TODO: Check where to put that since we don't have access to the pbar here # show progressbar only on progress_rank 0 # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: # self.trainer.progress_bar_callback.disable() - self.set_world_ranks() + process_idx = process_idx + proc_offset + + self.set_world_ranks(process_idx) # set warning rank rank_zero_only.rank = self.global_rank From 3fb8b4d07ee1696e262b2a9bf8c8d3a6de262475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 03:34:55 +0100 Subject: [PATCH 040/157] finish ddp plugin integration --- pytorch_lightning/accelerators/base_plugin.py | 2 +- .../accelerators/data_parallel.py | 119 ++++++++---------- pytorch_lightning/trainer/properties.py | 4 + pytorch_lightning/trainer/trainer.py | 56 ++++++++- pytorch_lightning/trainer/training_loop.py | 33 ----- 5 files changed, 108 insertions(+), 106 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 42b3e1f00b932..549d311f7f87d 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self, results, best_model_path): + def post_training(self, best_model_path): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 231da55fbfe10..64517273a9ced 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -113,6 +113,14 @@ def model(self, new_model): def lightning_module(self): return self._model + def start_training(self, trainer): + # double dispatch to initiate the training loop + return trainer.train() + + def start_testing(self, trainer): + # double dispatch to initiate the test loop + return trainer.run_test() + class 
SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): @@ -395,6 +403,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) def pre_training(self): + # TODO: check if needed seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: seed_everything(int(seed)) @@ -434,7 +443,7 @@ def pre_training(self): self.barrier() - def post_training(self, results, best_model_path): + def post_training(self, best_model_path): torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: @@ -486,6 +495,11 @@ def __init__( def root_device(self): return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) + def setup(self, model): self._model = model @@ -501,43 +515,19 @@ def set_world_ranks(self, process_idx): self.global_rank = self.determine_node_rank() * self.num_processes + process_idx self.world_size = self.num_nodes * self.num_processes - def pre_training(self): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, self.model, self.proc_offset,)) + def start_training(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) - print(self.global_rank, "I am still running", os.getpid(), - "i will go into training loop and crash because i didn't enter process group") + def start_testing(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) - def new_process(self, process_idx, mp_queue, model, proc_offset): - print("i am a new process", os.getpid()) + def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): # TODO: check if needed - # seed = os.environ.get("PL_GLOBAL_SEED") - # if seed is not None: - # seed_everything(int(seed)) - - # # TODO: Check if current process can be used as one training proc - # No because torch.multiprocessing does not support the fork method in combination with cuda - # # start from one since current process is proc 0 - # for proc_idx in range(1, self.num_processes): - # # use os.fork, since this enables us to continue from here - # # instead of spawning with separate function - # pid = os.fork() - # - # # set in child processes (PID=0). All previous child processes - # # should already have their process_idx assigned - # if pid == 0 and self.process_idx is None: - # self.process_idx = proc_idx + self.proc_offset - # - # # set process idx for current process - # if pid != 0: - # self.process_idx = 0 + self.proc_offset - - # TODO: Check where to put that since we don't have access to the pbar here - # show progressbar only on progress_rank 0 - # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) process_idx = process_idx + proc_offset - self.set_world_ranks(process_idx) # set warning rank @@ -559,39 +549,39 @@ def new_process(self, process_idx, mp_queue, model, proc_offset): log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") log.info("-" * 100) + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - self.configure_ddp() self.barrier() - def post_training(self, results, best_model_path): - # get original model - # TODO: How To get this? is this simply self.model? - # model = self.trainer.get_model() - model = self.model + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + self.transfer_distrib_spawn_state_on_fit_end(results) + def post_training(self, best_model_path): # clean up memory torch.cuda.empty_cache() - if self.process_idx == 0: - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - last_path = self.mp_queue.get() + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path, last_path) + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(best_path, last_path) + return results def configure_ddp(self): # if unset, default `find_unused_parameters` `True` @@ -616,7 +606,9 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: def determine_ddp_device_ids(self): return [self.root_device] - def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + def transfer_distrib_spawn_state_on_fit_end(self, results): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? + best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path if self.global_rank == 0 and self.mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') @@ -626,30 +618,24 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat # save the last weights last_path = None - # TODO: From where to get self.trainer.testing? - # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - if best_model_path is not None and len(best_model_path) > 0: + # TODO: is there a better way than accessing trainer through model -> trainer? + if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.model.state_dict(), last_path) + atomic_save(self.lightning_module.state_dict(), last_path) self.mp_queue.put(last_path) - - def __recover_child_process_weights(self, model, best_path, last_path): - # TODO: Where can we set this? + def __recover_child_process_weights(self, best_path, last_path): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
# transfer back the best path to the trainer - # if self.trainer.checkpoint_callback: - # self.trainer.checkpoint_callback.best_model_path = best_path + if self.lightning_module.trainer.checkpoint_callback: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also best score # load last weights # TODO: How to get self.trainer.testing? if last_path is not None: # and not self.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - # TODO: Where to set this? - # Do we really need to set this or can we just make the trainer property forward our current property here? - # self.trainer.model = model + self.lightning_module.load_state_dict(ckpt) def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -679,4 +665,5 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ output = sync_ddp_if_available(output, group, reduce_op) return output -# STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file + +# TODO: DDP2 (?), HOROVOD DDP AND HPC DDP diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index cb613dc087691..02844cb1375bd 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -338,6 +338,10 @@ def save_checkpoint(self, filepath, weights_only: bool = False): def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible + return self.lightning_module + + @property + def lightning_module(self): return self.training_type_plugin.lightning_module def __getstate__(self): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bf07c17727d59..5fd80fadfe751 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -532,8 +532,6 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) - - # TODO: is calling pre-training the correct place here @justus? self.training_type_plugin.pre_training() # ---------------------------- @@ -550,15 +548,16 @@ def fit( # TRAIN # ---------------------------- # hook + self.call_hook("on_fit_start") + # double dispatch: let the plugin initiate the training/test loop. if self.testing: - results = self.run_test() + self.training_type_plugin.start_testing(self) else: - results = self.train() + self.training_type_plugin.start_training(self) - # TODO: is calling post training the correct place here @justus? 
- self.training_type_plugin.post_training(results, self.checkpoint_callback.best_model_path) + results = self.training_type_plugin.post_training(self.checkpoint_callback.best_model_path) self.accelerator_backend.teardown() # ---------------------------- @@ -579,7 +578,49 @@ def fit( self._state = TrainerState.FINISHED return results or 1 + def pre_training_routine(self): + # wait for all to join if on distributed + self.accelerator.training_type_plugin.barrier("setup_training") + + # register auto-resubmit when on SLURM + self.slurm_connector.register_slurm_signal_handlers() + + # -------------------------- + # Pre-train + # -------------------------- + # on pretrain routine start + ref_model = self.get_model() + + self.on_pretrain_routine_start(ref_model) + if self.is_function_implemented("on_pretrain_routine_start"): + ref_model.on_pretrain_routine_start() + + # print model summary + if self.is_global_zero and self.weights_summary is not None and not self.testing: + if self.weights_summary in ModelSummary.MODES: + ref_model.summarize(mode=self.weights_summary) + else: + raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) + + # TODO: what the heck is this + # track model now. + # if cluster resets state, the model will update with the saved weights + # self.trainer.model = model + + # restore training and model before hpc is called + self.checkpoint_connector.restore_weights(ref_model) + + # on pretrain routine end + self.on_pretrain_routine_end(ref_model) + if self.is_function_implemented("on_pretrain_routine_end"): + ref_model.on_pretrain_routine_end() + def train(self): + self.pre_training_routine() + + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + self.run_sanity_check(self.get_model()) # set stage for logging @@ -748,6 +789,9 @@ def track_output_for_epoch_end(self, outputs, output): return outputs def run_test(self): + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + # only load test dataloader for testing # self.reset_test_dataloader(ref_model) with self.profiler.profile("run_test_evaluation"): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3b9e704f840b8..066b0818bde21 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -151,39 +151,6 @@ def setup_training(self, model: LightningModule): self.trainer.logger.log_graph(ref_model) self.trainer.logger.save() - # wait for all to join if on distributed - self.trainer.accelerator.training_type_plugin.barrier("setup_training") - - # register auto-resubmit when on SLURM - self.trainer.slurm_connector.register_slurm_signal_handlers() - - if not self.trainer.is_global_zero and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # -------------------------- - # Pre-train - # -------------------------- - # on pretrain routine start - self.trainer.on_pretrain_routine_start(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_start"): - ref_model.on_pretrain_routine_start() - - # print model summary - if self.trainer.is_global_zero and not self.trainer.testing: - ref_model.summarize(mode=self.trainer.weights_summary) - - # track model now. 
- # if cluster resets state, the model will update with the saved weights - self.trainer.model = model - - # restore training state and model weights before hpc is called - self.trainer.checkpoint_connector.restore_weights(model) - - # on pretrain routine end - self.trainer.on_pretrain_routine_end(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_end"): - ref_model.on_pretrain_routine_end() - def on_train_end(self): if self._teardown_already_run: return From 0f5298ee6624830d7f38840a0b111d21ce55e563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:50:28 +0100 Subject: [PATCH 041/157] remove logger from plugins --- .../accelerators/accelerator_connector.py | 1 - .../accelerators/data_parallel.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6c23caede81a9..40800db4c1c8c 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -192,7 +192,6 @@ def select_training_type_plugin(self): plugin = DDPSpawnPlugin( parallel_device_ids=self.parallel_devices, num_nodes=self.num_nodes, - logger=None, cluster_environment=TorchElasticEnvironment(), is_slurm_managing_tasks=False, # TODO: determine this ) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 64517273a9ced..529bfc69648e1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -41,10 +41,9 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): - def __init__(self, logger=None): + def __init__(self): self._model = None self.global_rank = 0 - self.logger = logger @property @abstractmethod @@ -123,8 +122,8 @@ def start_testing(self, trainer): class SingleDevicePlugin(TrainingTypePlugin): - def __init__(self, device, logger=None): - super().__init__(logger=logger) + def __init__(self, device): + super().__init__() self.device: torch.device = device @property @@ -161,8 +160,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): - super().__init__(logger=logger) + def __init__(self, parallel_device_ids, cluster_environment=None): + super().__init__() self.parallel_device_ids = parallel_device_ids self.local_rank = 0 self.world_size = 1 @@ -252,11 +251,12 @@ def __init__( is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], ) -> None: - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] - self.dist = LightningDistributed() self.num_nodes = num_nodes + self.logger = logger self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None @@ -476,13 +476,12 @@ def __init__( self, parallel_device_ids, num_nodes=1, - logger=None, cluster_environment=None, is_slurm_managing_tasks=False, proc_offset=0, **kwargs: Dict[str, Any] ): - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + 
super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset From 434e30ebad3debd2e1fb5c195de1afe743a24f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:53:43 +0100 Subject: [PATCH 042/157] setup --- pytorch_lightning/trainer/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5fd80fadfe751..6993d25cb1d94 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -532,6 +532,7 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + self.train_loop.setup_training(model) self.training_type_plugin.pre_training() # ---------------------------- @@ -542,8 +543,6 @@ def fit( # self.accelerator_backend.validation_loop = self.run_evaluation # self.accelerator_backend.test_loop = self.run_evaluation - self.train_loop.setup_training(model) - # ---------------------------- # TRAIN # ---------------------------- From 3fb31c8bfe2ce9a282ff9835f3266c9e4f260b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:57:33 +0100 Subject: [PATCH 043/157] remove logger arg --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 40800db4c1c8c..4683a8b2a5917 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -184,7 +184,6 @@ def select_training_type_plugin(self): plugin = DDPPlugin( parallel_device_ids=self.parallel_devices, num_nodes=self.num_nodes, - logger=None, cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 529bfc69648e1..6875224f62d0a 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -246,7 +246,6 @@ def __init__( self, parallel_device_ids, num_nodes=1, - logger=None, cluster_environment=None, is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], @@ -254,7 +253,6 @@ def __init__( super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes - self.logger = logger self.is_slurm_managing_tasks = is_slurm_managing_tasks self.dist = LightningDistributed() self._ddp_kwargs = kwargs @@ -334,8 +332,10 @@ def _call_children_scripts(self): os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - if self.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.logger.version) + print("logger", self.lightning_module.logger) + if self.lightning_module.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) + print("exp", os.environ["PL_EXP_VERSION"]) num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) From e7a7a87b321eb6602240a40af2a449ef975b3a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 20:00:34 +0100 Subject: [PATCH 044/157] module --- pytorch_lightning/accelerators/data_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 6875224f62d0a..2d05091fe3518 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -288,7 +288,8 @@ def setup(self, model): @property def lightning_module(self): - return self._model.module + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) def _call_children_scripts(self): From 1e8aa44ee3b9917b5a7670cfbd63b7611d9a5fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 20:01:21 +0100 Subject: [PATCH 045/157] clean up --- pytorch_lightning/accelerators/data_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 2d05091fe3518..7b34acc4b764d 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -333,10 +333,8 @@ def _call_children_scripts(self): os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - print("logger", self.lightning_module.logger) if self.lightning_module.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - print("exp", os.environ["PL_EXP_VERSION"]) num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) 
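For context between these commits, a minimal usage sketch of the spawn-based training type plugin selected above, assuming the public Trainer API (gpus, distributed_backend) is left unchanged by this refactor; TinyModel and the toy dataloader below are placeholders invented for illustration, and two visible CUDA devices are assumed.

import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TinyModel(pl.LightningModule):
    # placeholder LightningModule, only here to exercise the plugin selection
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":  # guard required by the spawn start method
    dataset = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    train_loader = DataLoader(dataset, batch_size=16)

    # "ddp_spawn" is expected to route through select_training_type_plugin()
    # to the DDPSpawnPlugin added above; gpus=2 assumes two CUDA devices.
    trainer = pl.Trainer(max_epochs=1, gpus=2, distributed_backend="ddp_spawn")
    trainer.fit(TinyModel(), train_loader)

With distributed_backend="ddp" the same selection path instead picks DDPPlugin, which launches the additional ranks as child scripts via _call_children_scripts() as shown in the hunk above.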
From 628fdc3ab447a97a073ca94f3736019bc3393dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 03:44:10 +0100 Subject: [PATCH 046/157] ddp_cpu integration --- .../accelerators/accelerator_connector.py | 33 +++++++---- .../accelerators/data_parallel.py | 55 +++++++++++-------- pytorch_lightning/trainer/properties.py | 2 +- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4683a8b2a5917..f1ebbd5950b6c 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,8 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ + DataParallelPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -94,8 +95,8 @@ def __init__( # for gpus allow int, string and gpu list # if auto_select_gpus and isinstance(gpus, int): # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) - self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) - self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) + self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) + self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids) # self.root_device = torch.device("cpu") self.set_distributed_mode() @@ -139,15 +140,25 @@ def tpu_id(self): @property def on_gpu(self): - return self.parallel_devices and torch.cuda.is_available() + return self.parallel_device_ids and torch.cuda.is_available() @property def num_gpus(self) -> int: - gpus = self.parallel_devices + gpus = self.parallel_device_ids if gpus is None: return 0 return len(gpus) + @property + def parallel_devices(self): + if self.on_gpu: + devices = [torch.device("cuda", i) for i in self.parallel_device_ids] + elif self.on_tpu: + raise NotImplementedError + else: + devices = [torch.device("cpu")] * self.num_processes + return devices + def select_precision_plugin(self): if self.precision == 32: self.amp_type = None @@ -180,16 +191,18 @@ def select_precision_plugin(self): raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): - if self.distributed_backend == "ddp": + if self.use_dp and self.distributed_backend == "dp": + plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) + elif self.use_ddp and self.distributed_backend == "ddp": plugin = DDPPlugin( - parallel_device_ids=self.parallel_devices, + parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) - elif self.use_ddp and self.distributed_backend == "ddp_spawn": + elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( - parallel_device_ids=self.parallel_devices, + parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=TorchElasticEnvironment(), is_slurm_managing_tasks=False, # TODO: determine this @@ -279,8 +292,6 @@ def set_distributed_mode(self): "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." ) self.use_ddp = True - self.data_parallel_device_ids = None - self.on_gpu = False # HOROVOD elif self.distributed_backend == "horovod": diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 7b34acc4b764d..8e55596f5952b 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -86,7 +86,6 @@ def determine_local_rank(self): return int(os.environ.get('LOCAL_RANK', 0)) def determine_node_rank(self): - # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. # otherwise use given node rank or default to node rank 0 env_vars = ['NODE_RANK', 'GROUP_RANK'] @@ -160,9 +159,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_device_ids, cluster_environment=None): + def __init__(self, parallel_devices: List[torch.device], cluster_environment=None): super().__init__() - self.parallel_device_ids = parallel_device_ids + self.parallel_devices = parallel_devices self.local_rank = 0 self.world_size = 1 self.cluster_environment = cluster_environment @@ -178,7 +177,7 @@ def root_device(self): @property def on_gpu(self): - return self.parallel_device_ids and torch.cuda.is_available() + return self.root_device.type == "cuda" and torch.cuda.is_available() @abstractmethod def setup(self, model): @@ -211,8 +210,9 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: class DataParallelPlugin(ParallelPlugin): + def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_device_ids) + self._model = LightningDataParallel(model, self.parallel_devices) def reduce(self, output): if isinstance(output, Result): @@ -225,12 +225,16 @@ def reduce(self, output): @property def root_device(self): - return torch.device("cuda", self.parallel_device_ids[0]) + return self.parallel_devices[0] @property def lightning_module(self): return self._model.module + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + def barrier(self, *args, **kwargs): pass @@ -244,13 +248,13 @@ class DDPPlugin(ParallelPlugin): def __init__( self, - parallel_device_ids, + parallel_devices, num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], ) -> None: - super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks @@ -258,11 +262,11 @@ def __init__( self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None - self.num_processes = len(parallel_device_ids) + self.num_processes = len(parallel_devices) @property def 
root_device(self): - return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + return self.parallel_devices[self.local_rank] def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -327,22 +331,20 @@ def _call_children_scripts(self): # when the trainer script was called the device has already been scoped by the time # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone # but forward the GPUs selected via environment variables - if self.parallel_device_ids is None: + if self.parallel_devices is None: raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" if self.lightning_module.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - num_gpus = len(self.parallel_device_ids) - # TODO: Add num_nodes (pass it in?) + num_gpus = len(self.parallel_devices) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" self.interactive_ddp_procs = [] - # TODO: Add num_processes (pass it in?) for local_rank in range(1, self.num_processes): env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" @@ -388,6 +390,8 @@ def configure_ddp(self): ) def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None return [self.root_device.index] def init_ddp_connection(self, global_rank: int, world_size: int) -> None: @@ -456,9 +460,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) def model_to_device(self): - # TODO: Can we easily make this a property that falls back here? 
- # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] - torch.cuda.set_device(self.root_device) + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): @@ -473,25 +476,25 @@ class DDPSpawnPlugin(ParallelPlugin): def __init__( self, - parallel_device_ids, + parallel_devices, num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, proc_offset=0, **kwargs: Dict[str, Any] ): - super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset self._ddp_kwargs = kwargs self.dist = LightningDistributed() - self.num_processes = len(parallel_device_ids) + self.num_processes = len(parallel_devices) self.mp_queue = None @property def root_device(self): - return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + return self.parallel_devices[self.local_rank] @property def lightning_module(self): @@ -570,6 +573,7 @@ def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): def post_training(self, best_model_path): # clean up memory + # TODO: move this to gpu accelerator torch.cuda.empty_cache() # restore main state with best weights @@ -602,7 +606,9 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) def determine_ddp_device_ids(self): - return [self.root_device] + if self.root_device.type == "cpu": + return None + return [self.root_device.index] def transfer_distrib_spawn_state_on_fit_end(self, results): # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
@@ -655,7 +661,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) def model_to_device(self): - torch.cuda.set_device(self.root_device) + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 02844cb1375bd..86d146783e2f3 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -135,7 +135,7 @@ def use_tpu(self): @property def num_nodes(self): - return self.accelerator_connector.num_gpus + return self.accelerator_connector.num_nodes @property def num_processes(self): From 9f369cc03d9e6a295b8b5382d9f5e7232c8b1e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:21:00 +0100 Subject: [PATCH 047/157] cuda context manager for emptying cache --- pytorch_lightning/accelerators/data_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8e55596f5952b..d76a7f291aa94 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -447,7 +447,8 @@ def pre_training(self): self.barrier() def post_training(self, best_model_path): - torch.cuda.empty_cache() + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -573,8 +574,8 @@ def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): def post_training(self, best_model_path): # clean up memory - # TODO: move this to gpu accelerator - torch.cuda.empty_cache() + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() # restore main state with best weights best_path = self.mp_queue.get() From a8e830609837a70e4d092f7cd626cbbf01eed8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:23:48 +0100 Subject: [PATCH 048/157] args --- pytorch_lightning/accelerators/data_parallel.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index d76a7f291aa94..8d6e23eac0879 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -481,13 +481,11 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, - proc_offset=0, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks - self.proc_offset = proc_offset self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) @@ -518,18 +516,17 @@ def set_world_ranks(self, process_idx): self.world_size = self.num_nodes * self.num_processes def start_training(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) def start_testing(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) + 
mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) - def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): + def new_process(self, process_idx, trainer): # TODO: check if needed seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: seed_everything(int(seed)) - process_idx = process_idx + proc_offset self.set_world_ranks(process_idx) # set warning rank From 71cbd334fc4db672c770aef811cbd8c088cbbe1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:33:43 +0100 Subject: [PATCH 049/157] move "log_gpu_memory" to logger connector --- .../accelerators/accelerator_connector.py | 2 -- .../logger_connector/logger_connector.py | 8 +++++--- pytorch_lightning/trainer/trainer.py | 19 +------------------ 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index f1ebbd5950b6c..75ecf398c1ec7 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -53,7 +53,6 @@ def __init__( auto_select_gpus, gpus, num_nodes, - log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, @@ -76,7 +75,6 @@ def __init__( self.auto_select_gpus = auto_select_gpus self.gpus = gpus self.num_nodes = num_nodes - self.log_gpu_memory = log_gpu_memory self.sync_batchnorm = sync_batchnorm self.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8e992f8f12034..887ed2f30979b 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -30,8 +30,10 @@ class LoggerConnector: - def __init__(self, trainer): + + def __init__(self, trainer, log_gpu_memory): self.trainer = trainer + self.log_gpu_memory = log_gpu_memory self._callback_metrics = MetricsHolder() self._evaluation_callback_metrics = MetricsHolder(to_float=True) self._logged_metrics = MetricsHolder() @@ -219,8 +221,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics= and global_step for the rest. 
""" # add gpu memory - if self.trainer._device_type == DeviceType.GPU and self.trainer.log_gpu_memory: - mem_map = memory.get_memory_profile(self.trainer.log_gpu_memory) + if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: + mem_map = memory.get_memory_profile(self.log_gpu_memory) metrics.update(mem_map) # add norms diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6993d25cb1d94..27ce210fd4630 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -328,7 +328,6 @@ def __init__( auto_select_gpus, gpus, num_nodes, - log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, @@ -337,7 +336,7 @@ def __init__( amp_backend, amp_level, ) - self.logger_connector = LoggerConnector(self) + self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) # self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) @@ -383,22 +382,6 @@ def __init__( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan ) - # init accelerator related flags - # self.accelerator_connector.on_trainer_init( - # num_processes, - # tpu_cores, - # accelerator, - # distributed_backend, - # auto_select_gpus, - # gpus, - # num_nodes, - # log_gpu_memory, - # sync_batchnorm, - # benchmark, - # replace_sampler_ddp, - # deterministic, - # ) - # init train loop related flags # TODO: remove in 1.3.0 if automatic_optimization is None: From 1a9ad4fa173b5c07275cac7bc90947690f242510 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 14 Dec 2020 16:14:19 +0100 Subject: [PATCH 050/157] fix imports --- pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- pytorch_lightning/trainer/trainer.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 722328dd66325..c6d6221fc11cc 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,7 +1,7 @@ from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType, NATIVE_AMP_AVALAIBLE +from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType from typing import Any, Union import math diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 75ecf398c1ec7..6aad549d4cdfb 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -21,7 +21,7 @@ from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from 
pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 27ce210fd4630..e15132a5849cb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -24,8 +24,8 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector +from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback, ModelCheckpoint From 7b874cc249f7eb2d421d6785e4aac3389b842bbb Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:47:43 +0100 Subject: [PATCH 051/157] typo --- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6aad549d4cdfb..8abc5db36340b 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -164,7 +164,7 @@ def select_precision_plugin(self): elif self.precision == 16: if self.amp_type == 'native': - if not NATIVE_AMP_AVALAIBLE: + if not NATIVE_AMP_AVAILABLE: rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' ' Consider upgrading with `pip install torch>=1.6`.' ' We will attempt to use NVIDIA Apex for this session.') From bc2460aee8395546bb63cb041f4609887e589266 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:02 +0100 Subject: [PATCH 052/157] remove todo --- pytorch_lightning/accelerators/accelerator.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c6d6221fc11cc..3f24d6b01c71d 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -110,7 +110,6 @@ def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, * ) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): - # TODO: Check out if this can be simplified with new LightningOptimizer! model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) @@ -232,11 +231,30 @@ def precision(self): @property def scaler(self): - if hasattr(self.precision_plugin, 'scaler'): + if hasattr(self.precision_plugin, "scaler"): return self.precision_plugin.scaler return None + @property + def rpc_enabled(self): + return self.training_type_plugin.rpc_enabled + + # TODO: Check where this comes from and why it is needed + def optimizer_state(self, optimizer: Optimizer) -> dict: + """ + Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom + plugins. 
+ Return: + Optimizer state dict + """ + if self.training_type_plugin and hasattr(self.training_type_plugin, "optimizer_state"): + return self.training_type_plugin.optimizer_state(optimizer) + return optimizer.state_dict() + + def on_save(self, checkpoint): + return checkpoint + class NewCPUAccelerator(NewAccelerator): def setup(self, trainer, model): From 506c44632540ade383aa0d2e11b4036d023958a9 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:19 +0100 Subject: [PATCH 053/157] add rpc_enabled flag --- pytorch_lightning/accelerators/data_parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8d6e23eac0879..331968ca9ee66 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -119,6 +119,10 @@ def start_testing(self, trainer): # double dispatch to initiate the test loop return trainer.run_test() + @property + def rpc_enabled(self): + return False + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device): From 19d19d575852aafdd90ab9f00af433269549534c Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:30 +0100 Subject: [PATCH 054/157] remove unused self arg --- pytorch_lightning/accelerators/scheduler_properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py index 6835df4499385..37dbdd13c3c58 100644 --- a/pytorch_lightning/accelerators/scheduler_properties.py +++ b/pytorch_lightning/accelerators/scheduler_properties.py @@ -1,7 +1,7 @@ from torch import optim -def reinit_scheduler_properties(self, optimizers: list, schedulers: list): +def reinit_scheduler_properties(optimizers: list, schedulers: list): # Reinitialize optimizer.step properties added by schedulers for scheduler in schedulers: scheduler = scheduler['scheduler'] From dd4d148b42464e076c11ece42fea01beac0f5dde Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:02 +0100 Subject: [PATCH 055/157] comment out unnexessary amp part --- pytorch_lightning/core/optimizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index acba35d9ae0ac..03559065725fe 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -129,8 +129,9 @@ def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_n with trainer.profiler.profile(profiler_name): xm.optimizer_step(optimizer, optimizer_args={'closure': closure, **kwargs}) - elif trainer.amp_backend is not None: - trainer.precision_connector.backend.optimizer_step(trainer, optimizer, closure) + # elif trainer.amp_backend is not None: + # # TODO: Adapt for new optimizer structure + # trainer.precision_connector.backend.optimizer_step(trainer, optimizer, closure) else: with trainer.profiler.profile(profiler_name): From f2fffc69cd0dcddf2e28c2ad97bb606bdc8d47f7 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:23 +0100 Subject: [PATCH 056/157] fix model connector --- pytorch_lightning/trainer/connectors/model_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py index a4bf9a6e505e6..563b664fffbc4 100644 --- 
a/pytorch_lightning/trainer/connectors/model_connector.py +++ b/pytorch_lightning/trainer/connectors/model_connector.py @@ -44,5 +44,5 @@ def get_model(self): def _get_reference_model(self, model): if self.trainer.accelerator_backend: - return self.trainer.accelerator_backend.get_reference_model(model) + return self.trainer.accelerator_backend.lightning_module return model From c6b3aeb8b17e304f36ee956e5fcc32ae23e97083 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:39 +0100 Subject: [PATCH 057/157] fix import --- pytorch_lightning/trainer/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e15132a5849cb..60e5a93b97d4e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,6 +15,7 @@ """Trainer to automate the training.""" import os +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path From 55fc9527ff2bad6f9419f6c9da0a7b28dfbc376f Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:56 +0100 Subject: [PATCH 058/157] copy properties only once --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 066b0818bde21..bc42de5aed110 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -105,8 +105,8 @@ def on_train_start(self): self.trainer.call_hook("on_train_start") def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): - # bind logger and other properties - self.trainer.model_connector.copy_trainer_model_properties(model) + # # bind logger and other properties + # self.trainer.model_connector.copy_trainer_model_properties(model) # clean hparams if hasattr(model, "hparams"): From 177a634c8245926b471ddfb0df279d05d7a83a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:16:54 +0100 Subject: [PATCH 059/157] add cluster env --- .../accelerators/accelerator_connector.py | 40 ++++++++++++------- .../trainer/connectors/slurm_connector.py | 4 +- pytorch_lightning/trainer/trainer.py | 3 +- tests/backends/test_accelerator_connector.py | 9 +++-- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8abc5db36340b..21e8a61e333ac 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -13,7 +13,6 @@ # limitations under the License. 
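The cluster-environment wiring that PATCH 059 threads into the accelerator connector assumes an environment object that can resolve rendezvous details (master address, master port, world size) from its scheduler's variables. A minimal sketch of such an object, not part of the patch, with illustrative method names, loosely modelled on the SLURM logic that a later patch in this series removes from SLURMConnector:

import os


class SlurmLikeEnvironment:
    # Illustrative only: resolves rendezvous info from SLURM environment variables.

    def master_address(self) -> str:
        # SLURM exposes the allocated node list; the first node acts as the master.
        node_list = os.environ.get("SLURM_NODELIST", "127.0.0.1")
        return node_list.split(" ")[0]

    def master_port(self) -> int:
        job_id = os.environ.get("SLURM_JOB_ID")
        if job_id is not None:
            # use the last 4 digits of the job id, shifted into the 10k+ port range,
            # so concurrent jobs from the same grid search do not clash
            return 15000 + int(job_id[-4:])
        return 12910

    def world_size(self) -> int:
        return int(os.environ.get("SLURM_NTASKS", "1"))
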
from typing import Union -from pytorch_lightning import accelerators import os import torch @@ -59,7 +58,8 @@ def __init__( deterministic, precision, amp_type, - amp_level + amp_level, + is_slurm_managing_tasks, ): # initialization @@ -82,6 +82,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level + self.is_slurm_managing_tasks = is_slurm_managing_tasks # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -110,12 +111,6 @@ def __init__( # init flags for SLURM+DDP to work self.world_size = 1 self.interactive_ddp_procs = [] - - # link up SLURM - # TODO: this should be taken out of here... but depends too much on DDP - # self.slurm_connector.on_trainer_init(self.num_nodes) - # self.node_rank = self.determine_ddp_node_rank() - # self.local_rank = self.determine_local_rank() self.global_rank = 0 # NVIDIA setup @@ -182,28 +177,26 @@ def select_precision_plugin(self): log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) - - - else: raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): + cluster_environment = self.select_cluster_environment() if self.use_dp and self.distributed_backend == "dp": plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_ddp and self.distributed_backend == "ddp": plugin = DDPPlugin( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, - cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? - is_slurm_managing_tasks=False, # TODO: determine this + cluster_environment=cluster_environment, + is_slurm_managing_tasks=self.is_slurm_managing_tasks, ) elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, - cluster_environment=TorchElasticEnvironment(), - is_slurm_managing_tasks=False, # TODO: determine this + cluster_environment=cluster_environment, + is_slurm_managing_tasks=self.is_slurm_managing_tasks, ) else: # TODO: cover all other cases @@ -225,6 +218,23 @@ def select_accelerator(self): training_type_plugin=self.select_training_type_plugin(), ) + def select_cluster_environment(self): + # TODO: support the cloud environment set by the plugin connector! 
+ # if self.trainer.plugin_connector.cloud_environment: + # env = self.trainer.plugin_connector.cloud_environment + # elif self.is_slurm_managing_tasks: + if self.is_slurm_managing_tasks: + env = SLURMEnvironment() + elif self._is_using_torchelastic(): + env = TorchElasticEnvironment() + else: + env = TorchElasticEnvironment() + return env + + def _is_using_torchelastic(self): + te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + return te_flags_passed + def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index ad860c0b154b2..212e126e4bac3 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -13,10 +13,8 @@ class SLURMConnector: - def __init__(self, trainer): + def __init__(self, trainer, num_gpu_nodes): self.trainer = trainer - - def on_trainer_init(self, num_gpu_nodes): self.configure_slurm_ddp(num_gpu_nodes) def configure_slurm_ddp(self, num_gpu_nodes): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 60e5a93b97d4e..14eb8e81d95ea 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,6 +322,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) + self.slurm_connector = SLURMConnector(self, num_nodes) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -336,6 +337,7 @@ def __init__( precision, amp_backend, amp_level, + self.is_slurm_managing_tasks, # set by slurm connector ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -345,7 +347,6 @@ def __init__( self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) - self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index f13830f68d8d6..1dddd48ea0d25 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -17,8 +17,10 @@ import pytest -from pytorch_lightning import accelerators, Trainer -from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning import Trainer, accelerators +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.old.accelerator import Accelerator from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.utilities import DistributedType @@ -28,7 +30,8 @@ def test_accelerator_choice_cpu(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.CPUAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend.training_type_plugin, SingleDevicePlugin) assert 
isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) model = BoringModel() From 7290e99ae50262242c99eafd0da29e69d37675fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:27:31 +0100 Subject: [PATCH 060/157] move slurm configuration --- .../accelerators/accelerator_connector.py | 40 ++++++- .../trainer/connectors/slurm_connector.py | 102 +----------------- pytorch_lightning/trainer/trainer.py | 3 +- 3 files changed, 40 insertions(+), 105 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 21e8a61e333ac..ad012ee1f6ead 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -59,7 +59,6 @@ def __init__( precision, amp_type, amp_level, - is_slurm_managing_tasks, ): # initialization @@ -82,7 +81,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level - self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.is_slurm_managing_tasks = False # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -99,6 +98,7 @@ def __init__( # self.root_device = torch.device("cpu") self.set_distributed_mode() + self.configure_slurm_ddp() # todo: select accelerator based on trainer flags self.accelerator = self.select_accelerator() @@ -347,3 +347,39 @@ def check_horovod(self): def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" return "OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ + + def configure_slurm_ddp(self): + # extract SLURM flag vars + # whenever we have the correct number of tasks, we let slurm manage processes + # otherwise we launch the required number of processes + if self.use_ddp or self.use_ddp2: + num_requested_gpus = self.num_gpus * self.num_nodes + num_slurm_tasks = 0 + try: + num_slurm_tasks = int(os.environ['SLURM_NTASKS']) + self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus + + # enable slurm cpu + if num_requested_gpus == 0: + self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes + + # in interactive mode we don't manage tasks + job_name = os.environ['SLURM_JOB_NAME'] + if job_name == 'bash': + self.is_slurm_managing_tasks = False + + except Exception: + # likely not on slurm, so set the slurm managed flag to false + self.is_slurm_managing_tasks = False + + # used for tests only, set this flag to simulate slurm managing a task + try: + should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS']) + if should_fake: + self.is_slurm_managing_tasks = True + except Exception: + pass + + # notify user the that slurm is managing tasks + if self.is_slurm_managing_tasks: + rank_zero_info('Multi-processing is handled by Slurm.') diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index 212e126e4bac3..02552dd67de26 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -1,69 +1,14 @@ import os -import re import signal from subprocess import call -import torch -import torch.distributed as torch_distrib - from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import DeviceType, DistributedType -from pytorch_lightning.utilities.distributed 
import rank_zero_info class SLURMConnector: - def __init__(self, trainer, num_gpu_nodes): + def __init__(self, trainer): self.trainer = trainer - self.configure_slurm_ddp(num_gpu_nodes) - - def configure_slurm_ddp(self, num_gpu_nodes): - self.trainer.is_slurm_managing_tasks = False - - # extract SLURM flag vars - # whenever we have the correct number of tasks, we let slurm manage processes - # otherwise we launch the required number of processes - if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): - self.trainer.num_requested_gpus = self.trainer.num_gpus * num_gpu_nodes - self.trainer.num_slurm_tasks = 0 - try: - self.trainer.num_slurm_tasks = int(os.environ['SLURM_NTASKS']) - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_requested_gpus - - # enable slurm cpu - if self.trainer.num_requested_gpus == 0: - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_processes - - # in interactive mode we don't manage tasks - job_name = os.environ['SLURM_JOB_NAME'] - if job_name == 'bash': - self.trainer.is_slurm_managing_tasks = False - # todo: specify the possible exception - except Exception: - # likely not on slurm, so set the slurm managed flag to false - self.trainer.is_slurm_managing_tasks = False - - # used for tests only, set this flag to simulate slurm managing a task - should_fake = os.environ.get('FAKE_SLURM_MANAGING_TASKS') - if should_fake and int(should_fake): - self.trainer.is_slurm_managing_tasks = True - - # notify user the that slurm is managing tasks - if self.trainer.is_slurm_managing_tasks: - rank_zero_info('Multi-processing is handled by Slurm.') - - # todo: the same function as slurm_environment.py `_resolve_root_node_address` - def resolve_root_node_address(self, root_node): - if '[' in root_node: - name, numbers = root_node.split('[', maxsplit=1) - number = numbers.split(',', maxsplit=1)[0] - if '-' in number: - number = number.split('-')[0] - - number = re.sub('[^0-9]', '', number) - root_node = name + number - - return root_node def register_slurm_signal_handlers(self): # see if we're using slurm (not interactive) @@ -110,48 +55,3 @@ def term_handler(self, signum, frame): # Todo: required argument `signum` is not used # Todo: required argument `frame` is not used log.info("bypassing sigterm") - - # todo: this is the same func as slurm_environment.py `master_port` - def connect_ddp(self, global_rank: int, world_size: int) -> None: - """ - Sets up environment variables necessary for pytorch distributed communications - based on slurm environment. 
- """ - # use slurm job id for the port number - # guarantees unique ports across jobs from same grid search - default_port = os.environ.get("SLURM_JOB_ID") - if default_port: - # use the last 4 numbers in the job id as the id - default_port = default_port[-4:] - # all ports should be in the 10k+ range - default_port = int(default_port) + 15000 - else: - default_port = 12910 - - # if user gave a port number, use that one instead - if "MASTER_PORT" in os.environ: - default_port = os.environ["MASTER_PORT"] - else: - os.environ["MASTER_PORT"] = str(default_port) - log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") - - # figure out the root node addr - root_node = os.environ.get("SLURM_NODELIST") - if root_node: - root_node = root_node.split(" ")[0] - else: - root_node = "127.0.0.1" - - root_node = self.trainer.slurm_connector.resolve_root_node_address(root_node) - os.environ["MASTER_ADDR"] = root_node - log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") - - torch_backend = "nccl" if self.trainer._device_type == DeviceType.GPU else "gloo" - - if not torch.distributed.is_initialized(): - log.info( - f"initializing ddp (SLURM): GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}" - ) - torch_distrib.init_process_group( - torch_backend, rank=global_rank, world_size=world_size - ) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 14eb8e81d95ea..60e5a93b97d4e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,7 +322,6 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.slurm_connector = SLURMConnector(self, num_nodes) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -337,7 +336,6 @@ def __init__( precision, amp_backend, amp_level, - self.is_slurm_managing_tasks, # set by slurm connector ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -347,6 +345,7 @@ def __init__( self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) + self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) From 1b9c095f6d1da8dabf94b51282bbd8586cc75b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:38:35 +0100 Subject: [PATCH 061/157] resolve importerrors --- pytorch_lightning/accelerators/accelerator.py | 8 +++++++- tests/core/test_datamodules.py | 2 -- tests/models/test_gpu.py | 5 ++--- tests/models/test_hooks.py | 2 -- tests/models/test_horovod.py | 9 +++++---- tests/models/test_tpu.py | 8 ++++---- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3f24d6b01c71d..242be59c082bf 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -284,4 +284,10 @@ def on_train_start(self): torch.cuda.empty_cache() -# TODO: Add NewTPUAccelerator \ No newline at end of file +# TODO: Complete the TPUAccelerator +class NewTPUAccelerator(NewAccelerator): + def setup(self, trainer, model): + raise NotImplementedError + + def on_train_start(self): + raise NotImplementedError diff --git 
a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index dd7f7e8614f6f..9817e3c85a7e0 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -20,7 +20,6 @@ import torch from pytorch_lightning import LightningDataModule, Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState from tests.base import BoringDataModule, BoringModel @@ -419,7 +418,6 @@ def transfer_batch_to_device(self, data, device): model.transfer_batch_to_device = dm.transfer_batch_to_device - trainer.accelerator_backend = GPUAccelerator(trainer) batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert dm.hook_called diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7cfeb8f0ae53e..4bf854da4b8d8 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -24,7 +24,8 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import BoringModel +from tests.base import EvalModelTemplate + PRETEND_N_OF_GPUS = 16 @@ -210,7 +211,6 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # non-transferrable types primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}] @@ -306,7 +306,6 @@ def to(self, *args, **kwargs): def test_non_blocking(): """ Tests that non_blocking=True only gets passed on torch.Tensor.to, but not on other objects. 
""" trainer = Trainer() - trainer.accelerator_backend = GPUAccelerator(trainer) batch = torch.zeros(2, 3) with patch.object(batch, 'to', wraps=batch.to) as mocked: diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1f25d46f82944..0565ba594179f 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -18,7 +18,6 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.trainer.states import TrainerState from tests.base import BoringModel, EvalModelTemplate @@ -116,7 +115,6 @@ def transfer_batch_to_device(self, data, device): batch = CustomBatch((torch.zeros(5, 28), torch.ones(5, 1, dtype=torch.long))) trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # running .fit() would require us to implement custom data loaders, we mock the model reference instead trainer.get_model = MagicMock(return_value=model) batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7ac7cd235f392..6b2eaef1f1da8 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,8 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE @@ -311,12 +312,12 @@ def _compute_batch(): accelerator='horovod', ) - accelerator_backend = trainer.accelerator_connector.select_accelerator() - assert isinstance(accelerator_backend, HorovodAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + # TODO: test that we selected the correct training_type_plugin based on horovod flags metric = Accuracy(compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=accelerator_backend.gather_all_tensors, + dist_sync_fn=trainer.accelerator_backend.gather_all_tensors, threshold=threshold) for i in range(hvd.rank(), num_batches, hvd.size()): diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 5e977eed765d0..45cd9b2154c43 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,8 +19,8 @@ from torch.utils.data import DataLoader import tests.base.develop_pipelines as tpipes -from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import TPUAccelerator +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.accelerators.accelerator import NewTPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE @@ -250,9 +250,9 @@ def test_broadcast_on_tpu(): """ Checks if an object from the master process is broadcasted to other processes correctly""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) - backend = TPUAccelerator(trainer) + assert isinstance(trainer.accelerator_backend, NewTPUAccelerator) obj = ("ver_0.5", "logger_name", rank) - result = backend.broadcast(obj) + result = 
trainer.accelerator_backend.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) xmp.spawn(test_broadcast, nprocs=8, start_method='fork') From e50aea912861256f11cb6f6b727678dae302ca8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 05:16:59 +0100 Subject: [PATCH 062/157] handle distributed_sampler_kwargs --- .../accelerators/data_parallel.py | 34 ++++++++++++++++--- pytorch_lightning/trainer/properties.py | 3 +- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 331968ca9ee66..b5f774f9b7bed 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -195,6 +195,14 @@ def connect(self, model): def is_global_zero(self) -> bool: return self.global_rank == 0 + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=len(self.parallel_devices), + rank=self.global_rank + ) + return distributed_sampler_kwargs + @staticmethod def configure_sync_batchnorm(model: LightningModule) -> LightningModule: """ @@ -272,6 +280,19 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=(self.num_nodes * self.num_processes), + rank=self.global_rank + ) + return distributed_sampler_kwargs + def determine_local_rank(self): if self.is_slurm_managing_tasks: return int(os.environ['SLURM_LOCALID']) @@ -294,11 +315,6 @@ def setup(self, model): # set the task idx self.task_idx = int(os.environ["LOCAL_RANK"]) - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - def _call_children_scripts(self): # bookkeeping of spawned processes @@ -504,6 +520,14 @@ def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early return getattr(self._model, "module", self._model) + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=(self.num_nodes * self.num_processes), + rank=self.global_rank + ) + return distributed_sampler_kwargs + def setup(self, model): self._model = model diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 86d146783e2f3..97d9885e57f32 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -365,8 +365,9 @@ def require_distributed_sampler(self): @property def distributed_sampler_kwargs(self): if self.accelerator_backend is not None: - return self.accelerator_backend.distributed_sampler_kwargs + return self.training_type_plugin.distributed_sampler_kwargs + # TODO: make sure the cases below are handled by the training_type_plugin if self._device_type == DeviceType.TPU: kwargs = dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) From 2e8f9444f70d9075b25ea2062de8b479ea3a661f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 05:22:34 +0100 Subject: [PATCH 063/157] move emptying cache to accelertor --- pytorch_lightning/accelerators/accelerator.py | 7 +++++++ 
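The distributed_sampler_kwargs property introduced in PATCH 062 above is what downstream dataloader code consumes when it builds a DistributedSampler. Roughly, and only as an illustration (the helper below is not part of the patch), assuming a plain map-style dataset:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def build_distributed_dataloader(dataset, trainer, batch_size=32):
    # e.g. dict(num_replicas=num_nodes * num_processes, rank=global_rank),
    # as returned by the active training type plugin
    kwargs = trainer.distributed_sampler_kwargs
    sampler = DistributedSampler(dataset, **kwargs)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)
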
pytorch_lightning/accelerators/data_parallel.py | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 242be59c082bf..a370106773e71 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -283,6 +283,10 @@ def on_train_start(self): with torch.cuda.device(self.root_device): torch.cuda.empty_cache() + def on_train_end(self): + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() # TODO: Complete the TPUAccelerator class NewTPUAccelerator(NewAccelerator): @@ -291,3 +295,6 @@ def setup(self, trainer, model): def on_train_start(self): raise NotImplementedError + + def on_train_end(self): + raise NotImplementedError diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index b5f774f9b7bed..73b77c65cf775 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -467,9 +467,6 @@ def pre_training(self): self.barrier() def post_training(self, best_model_path): - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -598,10 +595,6 @@ def new_process(self, process_idx, trainer): self.transfer_distrib_spawn_state_on_fit_end(results) def post_training(self, best_model_path): - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() From bcc7a72de742c1435ee2cad63abeea4a6d5cb902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 08:45:15 +0100 Subject: [PATCH 064/157] fix a few tests --- pytorch_lightning/accelerators/base_plugin.py | 2 +- .../accelerators/data_parallel.py | 18 +++++++++--------- pytorch_lightning/trainer/properties.py | 9 +++++++++ pytorch_lightning/trainer/trainer.py | 6 +++--- tests/trainer/test_dataloaders.py | 2 +- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 549d311f7f87d..3ecfb48726f76 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self, best_model_path): + def post_training(self): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 73b77c65cf775..60f61b65bf8c7 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -64,6 +64,10 @@ def model_to_device(self): def is_global_zero(self): raise NotImplementedError + @abstractmethod + def reduce(self, output, *args, **kwargs): + raise NotImplementedError + @abstractmethod def barrier(self, name: Optional[str] = None): raise NotImplementedError @@ -133,7 +137,7 @@ def __init__(self, device): def on_gpu(self): return self.device.type == "cuda" and torch.cuda.is_available() - def reduce(self, output): + def reduce(self, output, *args, **kwargs): return output @property @@ -170,10 +174,6 @@ def __init__(self, parallel_devices: List[torch.device], cluster_environment=Non self.world_size = 1 
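PATCH 047 and PATCH 063 converge on the same pattern for releasing cached GPU memory: call torch.cuda.empty_cache() inside a torch.cuda.device(...) context so the call is scoped to the plugin's root device rather than whatever CUDA device happens to be current. A standalone sketch of that pattern (the helper name is illustrative, not part of the patch):

import torch


def release_cached_memory(root_device: torch.device) -> None:
    # Only meaningful for CUDA devices; a CPU run has no CUDA cache to release.
    if root_device.type == "cuda":
        # Scope the call to the intended GPU instead of the default current device.
        with torch.cuda.device(root_device):
            torch.cuda.empty_cache()
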
self.cluster_environment = cluster_environment - @abstractmethod - def reduce(self, output): - raise NotImplementedError - @property @abstractmethod def root_device(self): @@ -187,7 +187,7 @@ def on_gpu(self): def setup(self, model): raise NotImplementedError - def connect(self, model): + def connect(self, model, *args, **kwargs): self.setup(model) return self.model @@ -226,7 +226,7 @@ class DataParallelPlugin(ParallelPlugin): def setup(self, model): self._model = LightningDataParallel(model, self.parallel_devices) - def reduce(self, output): + def reduce(self, output, *args, **kwargs): if isinstance(output, Result): output.dp_reduce() @@ -466,7 +466,7 @@ def pre_training(self): self.barrier() - def post_training(self, best_model_path): + def post_training(self): if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -594,7 +594,7 @@ def new_process(self, process_idx, trainer): # persist info in ddp_spawn self.transfer_distrib_spawn_state_on_fit_end(results) - def post_training(self, best_model_path): + def post_training(self): # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 97d9885e57f32..0a85a4a298ae3 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -66,6 +66,7 @@ class TrainerProperties(ABC): accelerator_backend: NewAccelerator num_nodes: int num_processes: int + accelerator_connector: BackendConnector @property def accelerator(self): @@ -141,6 +142,14 @@ def num_nodes(self): def num_processes(self): return self.accelerator_connector.num_processes + @property + def root_gpu(self): + return self.accelerator_connector.root_gpu + + @property + def data_parallel_device_ids(self): + return self.accelerator_connector.parallel_device_ids + @property def log_dir(self): if self.checkpoint_callback is not None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 60e5a93b97d4e..0bae9a788c10c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -540,7 +540,7 @@ def fit( else: self.training_type_plugin.start_training(self) - results = self.training_type_plugin.post_training(self.checkpoint_callback.best_model_path) + results = self.training_type_plugin.post_training() self.accelerator_backend.teardown() # ---------------------------- @@ -900,8 +900,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): f"specify a path for a checkpoint .test(ckpt_path=PATH)" ) return {} - if self.accelerator_backend is not None and not self._device_type == DeviceType.TPU: - self.accelerator_backend.barrier() + if not self._device_type == DeviceType.TPU: + self.training_type_plugin.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt["state_dict"]) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index a93a722bba597..42d9072e476d6 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -129,7 +129,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction(dataloader, model) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) From 259c7f72b4fd6006dd9d117d84fac63fc5f51e3f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:05:25 +0100 Subject: [PATCH 065/157] restoring the result from subprocess --- .../accelerators/data_parallel.py | 27 ++++++++++++------- pytorch_lightning/trainer/trainer.py | 3 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 60f61b65bf8c7..4f7984d25c77f 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -43,6 +43,7 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): def __init__(self): self._model = None + self._results = None self.global_rank = 0 @property @@ -76,6 +77,7 @@ def barrier(self, name: Optional[str] = None): def broadcast(self, obj: object, src: int = 0) -> object: raise NotImplementedError + # TODO method this is currently unused def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): if device_ids is None: return @@ -115,17 +117,26 @@ def model(self, new_model): def lightning_module(self): return self._model + @property + def results(self): + """ + The results of the last training/testing run will be cached here. + In distributed training, we make sure to transfer the results to the appropriate master process. + """ + # TODO: improve these docs + return self._results + + @property + def rpc_enabled(self): + return False + def start_training(self, trainer): # double dispatch to initiate the training loop - return trainer.train() + self._results = trainer.train() def start_testing(self, trainer): # double dispatch to initiate the test loop - return trainer.run_test() - - @property - def rpc_enabled(self): - return False + self._results = trainer.run_test() class SingleDevicePlugin(TrainingTypePlugin): @@ -597,12 +608,10 @@ def new_process(self, process_idx, trainer): def post_training(self): # restore main state with best weights best_path = self.mp_queue.get() - results = self.mp_queue.get() last_path = self.mp_queue.get() - + self._results = self.mp_queue.get() # recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) - return results def configure_ddp(self): # if unset, default `find_unused_parameters` `True` diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0bae9a788c10c..ce1741ecfbbb6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -540,8 +540,9 @@ def fit( else: self.training_type_plugin.start_training(self) - results = self.training_type_plugin.post_training() + self.training_type_plugin.post_training() self.accelerator_backend.teardown() + results = self.training_type_plugin.results # ---------------------------- # POST-Training CLEAN UP From dfab52a001f5acb73bcb9c91cea2ec6227a57349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:08:34 +0100 Subject: [PATCH 066/157] fix queue.get() order for results --- pytorch_lightning/accelerators/data_parallel.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 4f7984d25c77f..56806f604f53e 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -610,6 +610,7 @@ def post_training(self): best_path = self.mp_queue.get() last_path = self.mp_queue.get() self._results = self.mp_queue.get() + 
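PATCH 065 and PATCH 066 make the spawned DDP workers hand their results back to the parent process through a multiprocessing queue, and the order of get() calls in the parent must mirror the order of put() calls in the worker (best path, then last path, then results). A self-contained sketch of that handoff, not taken from the patch, with placeholder payloads:

import torch.multiprocessing as mp


def worker(rank, queue):
    # Only the rank-0 worker reports back; the put() order here must
    # match the get() order in the parent below.
    if rank == 0:
        queue.put("best_model.ckpt")          # best_model_path
        queue.put("best_model.tmp_end.ckpt")  # last_path
        queue.put({"test_acc": 0.9})          # results


if __name__ == "__main__":
    smp = mp.get_context("spawn")
    queue = smp.SimpleQueue()
    mp.spawn(worker, nprocs=2, args=(queue,))  # blocks until all workers exit
    best_path, last_path, results = queue.get(), queue.get(), queue.get()
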
# recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) @@ -644,9 +645,6 @@ def transfer_distrib_spawn_state_on_fit_end(self, results): if self.global_rank == 0 and self.mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(results) # save the last weights last_path = None @@ -654,7 +652,11 @@ def transfer_distrib_spawn_state_on_fit_end(self, results): if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) atomic_save(self.lightning_module.state_dict(), last_path) + + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) self.mp_queue.put(last_path) + self.mp_queue.put(results) def __recover_child_process_weights(self, best_path, last_path): # TODO: is there a better way than accessing callback through model -> trainer -> callback? From 6742488d0210b57105ebc5a64e7f59e60d76e8f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:29:52 +0100 Subject: [PATCH 067/157] add missing "block_backward_sync" context manager --- pytorch_lightning/accelerators/data_parallel.py | 15 ++++++++++++++- pytorch_lightning/trainer/training_loop.py | 5 +++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 56806f604f53e..4ccca43cc0902 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod import re +from contextlib import contextmanager + from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -20,7 +22,6 @@ import numpy as np import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -import contextlib import torch.multiprocessing as mp from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info @@ -231,6 +232,18 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model + @contextmanager + def block_backward_sync(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. 
+ This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(self.model, LightningDistributedDataParallel): + yield self.model.no_sync() + else: + yield None + class DataParallelPlugin(ParallelPlugin): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bc42de5aed110..65437ebc5e5dd 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,6 +18,7 @@ import numpy as np import torch +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary @@ -719,8 +720,8 @@ def block_ddp_sync_behaviour(self): Returns: context manager with sync behaviour off """ - if self.trainer.accelerator_backend is not None and self.automatic_optimization: - yield self.trainer.accelerator_backend.block_ddp_plugin_sync_behaviour() + if isinstance(self.trainer.training_type_plugin, ParallelPlugin) is not None and self.automatic_optimization: + yield self.trainer.training_type_plugin.block_backward_sync() else: yield None From 8c89932458867ee3d48bf1412afc063e0e069307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:31:16 +0100 Subject: [PATCH 068/157] add missing "block_backward_sync" context manager --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 65437ebc5e5dd..7c010ba72c137 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -720,7 +720,7 @@ def block_ddp_sync_behaviour(self): Returns: context manager with sync behaviour off """ - if isinstance(self.trainer.training_type_plugin, ParallelPlugin) is not None and self.automatic_optimization: + if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and self.automatic_optimization: yield self.trainer.training_type_plugin.block_backward_sync() else: yield None From 0186a0fa5e9fe145118bbee055709024fb2336f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 10:06:11 +0100 Subject: [PATCH 069/157] fix sync_batchnorm --- .../accelerators/accelerator_connector.py | 2 ++ pytorch_lightning/accelerators/data_parallel.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index ad012ee1f6ead..91bad5fc5f373 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -190,6 +190,7 @@ def select_training_type_plugin(self): num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, + sync_batchnorm=self.sync_batchnorm, ) elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( @@ -197,6 +198,7 @@ def select_training_type_plugin(self): num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, + sync_batchnorm=self.sync_batchnorm, ) else: # TODO: cover all other cases diff --git 
a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 4ccca43cc0902..b8290ae4b1cd8 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -288,12 +288,14 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, + sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.sync_batchnorm = sync_batchnorm self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False @@ -481,7 +483,8 @@ def pre_training(self): self.dist.rank = self.global_rank self.dist.device = self.root_device - self.model = self.configure_sync_batchnorm(self.model) + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() @@ -522,11 +525,13 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, + sync_batchnorm=False, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.sync_batchnorm = sync_batchnorm self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) @@ -601,7 +606,8 @@ def new_process(self, process_idx, trainer): self.dist.rank = self.global_rank self.dist.device = self.root_device - self.model = self.configure_sync_batchnorm(self.model) + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() From b2ac1f401fc14343d8a037bae58e7386cf9430d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 10:10:49 +0100 Subject: [PATCH 070/157] fix supported gpu-ids for tuple --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 4bf854da4b8d8..5643dce5a6160 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -162,6 +162,7 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu): pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), pytest.param([0], [0]), pytest.param([1, 3], [1, 3]), + pytest.param((1, 3), [1, 3]), pytest.param('0', [0]), pytest.param('3', [3]), pytest.param('1, 3', [1, 3]), @@ -181,7 +182,6 @@ def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): pytest.param([-1]), pytest.param([None]), pytest.param(['0']), - pytest.param((0, 1)), ]) def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): with pytest.raises(MisconfigurationException): From 07a41ce9226f3c241424dc7429536a91f8d901b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 12:05:33 +0100 Subject: [PATCH 071/157] fix clip gradients and inf recursion --- pytorch_lightning/accelerators/accelerator.py | 13 ++++++++----- pytorch_lightning/accelerators/precision.py | 3 +++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index a370106773e71..d2c040a30d9e9 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ 
b/pytorch_lightning/accelerators/accelerator.py @@ -150,16 +150,19 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - self._clip_gradients(optimizer, grad_clip_val) model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + + # if self.trainer.amp_backend == AMPType.APEX: + # parameters = self.precision_plugin.master_params(optimizer) + # else: + # parameters = model.parameters() + + # TODO # ... or we call master_params() and in the default plugin we return the model.parameters() - if self.trainer.amp_backend == AMPType.APEX: - parameters = self.precision_plugin.master_params(optimizer) - else: - parameters = model.parameters() + parameters = self.precision_plugin.master_params(optimizer) max_norm = grad_clip_val norm_type = float(2.0) diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 3ce68c8e1efc6..a2ee98b686bae 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -112,6 +112,9 @@ def __init__(self, amp_level): self.backend = AMPType.APEX self.amp_level = amp_level + def master_params(self, optimizer): + return amp.master_params(optimizer) + def connect(self, model, optimizers, lr_schedulers): model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) reinit_scheduler_properties(optimizers, lr_schedulers) From 63b7eafa03c0bdafe8dc0fe6ed54680a3a5c2295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 23 Dec 2020 12:11:32 +0100 Subject: [PATCH 072/157] accelerator selection: added cluster_environment plugin --- .../accelerators/accelerator_connector.py | 60 +++--- .../accelerators/data_parallel.py | 4 +- pytorch_lightning/plugins/plugin_connector.py | 19 +- pytorch_lightning/trainer/properties.py | 4 + pytorch_lightning/trainer/trainer.py | 11 +- tests/backends/test_accelerator_connector.py | 175 ++++++++---------- 6 files changed, 137 insertions(+), 136 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 91bad5fc5f373..935548b9fd6e3 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin + DataParallelPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -59,6 +59,7 @@ def __init__( precision, amp_type, amp_level, + cluster_environment, ): # initialization @@ -81,6 +82,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level + self.cluster_environment = cluster_environment self.is_slurm_managing_tasks = False # init the default rank if exists @@ -152,6 +154,11 @@ def parallel_devices(self): devices = [torch.device("cpu")] * self.num_processes return devices + @property + def is_using_torchelastic(self): + te_flags_passed = 'WORLD_SIZE' in os.environ 
and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + return te_flags_passed + def select_precision_plugin(self): if self.precision == 32: self.amp_type = None @@ -182,26 +189,43 @@ def select_precision_plugin(self): def select_training_type_plugin(self): cluster_environment = self.select_cluster_environment() - if self.use_dp and self.distributed_backend == "dp": - plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) - elif self.use_ddp and self.distributed_backend == "ddp": - plugin = DDPPlugin( + if self.use_ddp2: + plugin = DDP2Plugin( parallel_devices=self.parallel_devices, - num_nodes=self.num_nodes, - cluster_environment=cluster_environment, - is_slurm_managing_tasks=self.is_slurm_managing_tasks, - sync_batchnorm=self.sync_batchnorm, + cluster_environment=cluster_environment ) - elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): - plugin = DDPSpawnPlugin( + elif self.use_ddp: + use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks + use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic + use_ddp_spawn = self.use_ddp and self.distributed_backend == "ddp_spawn" + use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" + use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic + use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks + + # ddp script mode uses the same flags as TE + # TODO: decouple from TE + if os.environ.get('PL_IN_DDP_SUBPROCESS', False): + use_torchelastic_ddp = False + + if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + ddp_plugin_cls = DDPPlugin + elif use_ddp_spawn or use_ddp_cpu_spawn: + ddp_plugin_cls = DDPSpawnPlugin + else: + ddp_plugin_cls = DDPPlugin + + plugin = ddp_plugin_cls( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, sync_batchnorm=self.sync_batchnorm, ) + elif self.use_dp: + plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) + elif self.use_horovod: + raise NotImplementedError else: - # TODO: cover all other cases plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -221,22 +245,16 @@ def select_accelerator(self): ) def select_cluster_environment(self): - # TODO: support the cloud environment set by the plugin connector! 
- # if self.trainer.plugin_connector.cloud_environment: - # env = self.trainer.plugin_connector.cloud_environment - # elif self.is_slurm_managing_tasks: + if self.cluster_environment is not None: + return self.cluster_environment if self.is_slurm_managing_tasks: env = SLURMEnvironment() - elif self._is_using_torchelastic(): + elif self.is_using_torchelastic: env = TorchElasticEnvironment() else: env = TorchElasticEnvironment() return env - def _is_using_torchelastic(self): - te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) - return te_flags_passed - def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index b8290ae4b1cd8..ba28732336430 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -339,7 +339,7 @@ def setup(self, model): self._call_children_scripts() # set the task idx - self.task_idx = int(os.environ["LOCAL_RANK"]) + self.task_idx = self.cluster_environment.local_rank() def _call_children_scripts(self): @@ -721,3 +721,5 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ # TODO: DDP2 (?), HOROVOD DDP AND HPC DDP +class DDP2Plugin(DDPPlugin): + pass diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index ccd128d87a26a..e1071fa24ec04 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -26,20 +26,21 @@ class PluginConnector: - def __init__(self, trainer): + def __init__(self, trainer, plugins: Optional[Union[str, list]]): self.trainer = trainer - self.plugins = [] + self.plugins = plugins or [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - - def on_trainer_init(self, plugins: Optional[Union[str, list]]): - self.plugins = plugins - if self.plugins is None: - self.plugins = [] + self.amp_plugin = NativeAMPPlugin(trainer) + self.apex_plugin = ApexPlugin(trainer) self.plugins = self._convert_str_custom_plugins(self.plugins) - self.plugins = self._append_required_plugins(self.plugins) - self.__attach_ddp() + # TODO: do we need this? 
+ # self.plugins = self._append_required_plugins(self.plugins) self.__attach_cluster() + # TODO: attach training_type_plugin + + def on_trainer_init(self): + self.__attach_ddp() self.__attach_amp() self.__attach_apex() diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 0a85a4a298ae3..bb7559f503b25 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -86,6 +86,10 @@ def distributed_backend(self): def training_type_plugin(self): return self.accelerator.training_type_plugin + @property + def precision_plugin(self): + return self.accelerator.precision_plugin + @property def global_rank(self): return self.accelerator.training_type_plugin.global_rank diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ce1741ecfbbb6..fa1e853153853 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,6 +322,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) + self.plugin_connector = PluginConnector(self, plugins) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -336,6 +337,7 @@ def __init__( precision, amp_backend, amp_level, + self.plugin_connector.cloud_environment ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -349,7 +351,6 @@ def __init__( self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) - self.plugin_connector = PluginConnector(self) # training state self.model = None @@ -431,7 +432,8 @@ def __init__( # self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # last thing are the plugins which override whatever the trainer used by default - self.plugin_connector.on_trainer_init(plugins) + # TODO: probably not needed anymore after refactor + self.plugin_connector.on_trainer_init() # Callback system self.on_init_end() @@ -517,7 +519,6 @@ def fit( self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) - self.training_type_plugin.pre_training() # ---------------------------- # INSPECT THESE FOR MAIN LOOPS # ---------------------------- @@ -531,9 +532,11 @@ def fit( # TRAIN # ---------------------------- # hook - self.call_hook("on_fit_start") + # plugin will setup training (e.g. ddp will launch child processes) + self.training_type_plugin.pre_training() + # double dispatch: let the plugin initiate the training/test loop.
if self.testing: self.training_type_plugin.start_testing(self) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 1dddd48ea0d25..37a1911be38d3 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -16,94 +16,59 @@ from unittest import mock import pytest +import torch -from pytorch_lightning import Trainer, accelerators -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin -from pytorch_lightning.accelerators.old.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewGPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from pytorch_lightning.accelerators.precision import PrecisionPlugin +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment -from pytorch_lightning.utilities import DistributedType from tests.base.boring_model import BoringModel def test_accelerator_choice_cpu(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) - assert isinstance(trainer.accelerator_backend.training_type_plugin, SingleDevicePlugin) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - - model = BoringModel() trainer = Trainer( fast_dev_run=True, - callbacks=[CB()] ) - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) def test_accelerator_choice_ddp_cpu(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=2, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, 
TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp_spawn(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp_spawn', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, { @@ -117,11 +82,13 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -148,11 +115,13 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp2_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp2 + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() @@ -178,11 +147,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert 
trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -207,11 +177,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp2_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp2 + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -235,12 +206,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx - + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -266,9 +237,11 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_cpu_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) raise SystemExit() model = BoringModel() @@ -302,9 +275,10 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, 
accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, CustomCluster) + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) raise SystemExit() model = BoringModel() @@ -329,29 +303,27 @@ def on_fit_start(self, trainer, pl_module): }) @mock.patch('torch.cuda.device_count', return_value=0) def test_custom_accelerator(tmpdir): - class Accel(Accelerator): - def init_ddp_connection( - self, - global_rank: int, - world_size: int, - is_slurm_managing_tasks: bool = True) -> None: - pass + class Accel(NewAccelerator): + pass - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, Accel) - raise SystemExit() + class Prec(PrecisionPlugin): + pass - model = BoringModel() + class TrainTypePlugin(SingleDevicePlugin): + pass + + accelerator = Accel( + training_type_plugin=TrainTypePlugin(device=torch.device("cpu")), + precision_plugin=Prec(), + ) trainer = Trainer( + accelerator=accelerator, fast_dev_run=True, - accelerator=Accel(), num_processes=2, - callbacks=[CB()] ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, Accel) + assert isinstance(trainer.training_type_plugin, TrainTypePlugin) + assert isinstance(trainer.precision_plugin, Prec) @mock.patch.dict(os.environ, { @@ -365,7 +337,8 @@ def on_fit_start(self, trainer, pl_module): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) raise SystemExit() model = BoringModel() From f8344c5afe7bcfee3b942c3ba6084878ae0ec829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 23 Dec 2020 13:10:34 +0100 Subject: [PATCH 073/157] fix torchelastic test --- pytorch_lightning/accelerators/accelerator_connector.py | 4 ++++ pytorch_lightning/accelerators/data_parallel.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 935548b9fd6e3..3733fad589921 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -251,7 +251,11 @@ def select_cluster_environment(self): env = SLURMEnvironment() elif self.is_using_torchelastic: env = TorchElasticEnvironment() + # TODO: decouple DDP from TE + # maybe introduce a DefaultEnvironment? + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" else: + # TODO: maybe introduce a DefaultEnvironment? 
env = TorchElasticEnvironment() return env diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index ba28732336430..ab94eea92b3f5 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -2,6 +2,7 @@ import re from contextlib import contextmanager +from pytorch_lightning.cluster_environments import TorchElasticEnvironment from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -335,6 +336,7 @@ def setup(self, model): self._model = model # start the other scripts + # TODO: make sure this works, in torchelastic we should not launch child processes! if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": self._call_children_scripts() From 34e3c15c18d9fd48c63e114ef651595b71c8ddf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 08:04:33 +0100 Subject: [PATCH 074/157] fix reduce early stopping decision for DDP --- pytorch_lightning/accelerators/accelerator.py | 4 ---- pytorch_lightning/accelerators/data_parallel.py | 12 ++++++++++++ pytorch_lightning/callbacks/early_stopping.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index d2c040a30d9e9..9a3824b794089 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -194,10 +194,6 @@ def on_train_epoch_end(self, outputs): def on_train_end(self): pass - # TODO: Check if we can change logic for early stopping to accelerator/trainer completely or have a separate connector (should be self contained) - def early_stopping_should_stop(self, pl_module): - return self.trainer.should_stop - def setup_optimizers(self, trainer, model): if trainer.testing is True: return diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index ab94eea92b3f5..eeb14380402d6 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -107,6 +107,9 @@ def determine_node_rank(self): rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") return int(rank) + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + @property def model(self): return self._model @@ -216,6 +219,12 @@ def distributed_sampler_kwargs(self): ) return distributed_sampler_kwargs + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) + should_stop = bool(should_stop == self.world_size) + return should_stop + @staticmethod def configure_sync_batchnorm(model: LightningModule) -> LightningModule: """ @@ -278,6 +287,9 @@ def barrier(self, *args, **kwargs): def broadcast(self, obj: object, src: int = 0) -> object: return obj + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + class DDPPlugin(ParallelPlugin): diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index ec44a1eeb416b..d39e600820735 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -188,6 +188,7 @@ def _run_early_stopping_check(self, trainer, pl_module): 
return # short circuit if metric not present current = logs.get(self.monitor) + should_stop = False # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) @@ -204,5 +205,5 @@ def _run_early_stopping_check(self, trainer, pl_module): trainer.should_stop = True # stop every ddp process if any world process decides to stop - should_stop = trainer.accelerator_backend.early_stopping_should_stop(pl_module) + should_stop = trainer.training_type_plugin.reduce_early_stopping_decision(should_stop) trainer.should_stop = should_stop From 27a4cff940efc305b0a573f4b7d2e40c0aae2b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 10:05:56 +0100 Subject: [PATCH 075/157] fix tests: callbacks, conversion to lightning optimizer --- pytorch_lightning/accelerators/accelerator.py | 1 + .../accelerators/data_parallel.py | 8 +++--- pytorch_lightning/trainer/optimizers.py | 5 ++-- pytorch_lightning/trainer/properties.py | 25 +++++++++++++------ pytorch_lightning/trainer/trainer.py | 2 ++ tests/callbacks/test_callbacks.py | 9 +++---- tests/models/test_hooks.py | 4 +-- 7 files changed, 33 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9a3824b794089..8c1bfdc9301cb 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -36,6 +36,7 @@ def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) + self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self): diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index eeb14380402d6..dcc6e4b139406 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -480,10 +480,10 @@ def pre_training(self): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks self.init_ddp_connection(self.global_rank, self.world_size) - # TODO: Move this somewhere else + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting @@ -603,10 +603,10 @@ def new_process(self, process_idx, trainer): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks self.init_ddp_connection(self.global_rank, self.world_size) - # TODO: Move this somewhere else + # TODO: we moved it to the trainer.fit after calling pre_training + # ... 
need to double check that it is the correct place # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 919042516ad50..e56856dfb2b4f 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -81,7 +81,7 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: return optimizers, lr_schedulers, optimizer_frequencies - def convert_to_lightning_optimizers(self): + def convert_to_lightning_optimizers(self, optimizers): def _convert_to_lightning_optimizer(trainer, optimizer): if not isinstance(optimizer, LightningOptimizer): optimizer = LightningOptimizer(optimizer) @@ -89,7 +89,8 @@ def _convert_to_lightning_optimizer(trainer, optimizer): return optimizer if self._enable_pl_optimizer: - self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers] + optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in optimizers] + return optimizers def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): # Convert each scheduler into dict structure with relevant information diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index bb7559f503b25..e4a78704749fa 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -258,6 +258,10 @@ def match_env_arguments(cls) -> Namespace: def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: return add_argparse_args(cls, parent_parser) + @property + def gpus(self) -> Optional[Union[List[int], str, int]]: + return self.accelerator_connector.gpus + @property def num_gpus(self) -> int: return self.accelerator_connector.num_gpus @@ -357,15 +361,20 @@ def get_model(self): def lightning_module(self): return self.training_type_plugin.lightning_module - def __getstate__(self): - # unwrap optimizer - self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] - return self.__dict__ + @property + def optimizers(self): + return self.accelerator.optimizers - def __setstate__(self, d): - self.__dict__ = d - # wrap optimizers in enable_pl_optimzer is True - self.convert_to_lightning_optimizers() + # TODO: Do we need getstate / setstate? + # def __getstate__(self): + # # unwrap optimizer + # self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + # return self.__dict__ + # + # def __setstate__(self, d): + # self.__dict__ = d + # # wrap optimizers in enable_pl_optimzer is True + # self.convert_to_lightning_optimizers() @property def require_distributed_sampler(self): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index fa1e853153853..a0d62d2a1104d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -537,6 +537,8 @@ def fit( # plugin will setup training (e.g. ddp will launch child processes) self.training_type_plugin.pre_training() + self.call_setup_hook(self.lightning_module) + # double dispatch: let the plugin initiate the training/test loop. 
if self.testing: self.training_type_plugin.start_testing(self) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 53d6f80d9d7bf..f3e1dabfb6e59 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -56,8 +56,8 @@ def test_trainer_callback_system(torch_save): call.on_init_start(trainer), call.on_init_end(trainer), call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'fit'), call.on_fit_start(trainer, model), + call.setup(trainer, model, 'fit'), call.on_pretrain_routine_start(trainer, model), call.on_pretrain_routine_end(trainer, model), call.on_sanity_check_start(trainer, model), @@ -110,11 +110,10 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'test'), call.on_fit_start(trainer, model), - call.on_pretrain_routine_start(trainer, model), - call.on_pretrain_routine_end(trainer, model), + call.setup(trainer, model, 'test'), + # call.on_pretrain_routine_start(trainer, model), + # call.on_pretrain_routine_end(trainer, model), call.on_test_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 0565ba594179f..72f0790ca3df3 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -356,8 +356,8 @@ def teardown(self, stage: str): expected = [ 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', + # 'on_pretrain_routine_start', + # 'on_pretrain_routine_end', 'on_test_model_eval', 'on_test_start', 'on_test_epoch_start', From df5ac30ba7450123d873abb1ec33deae534d79f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 13:20:05 +0100 Subject: [PATCH 076/157] fix lightning optimizer does not pickle --- pytorch_lightning/trainer/properties.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index e4a78704749fa..f7daa1c44708c 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -365,16 +365,17 @@ def lightning_module(self): def optimizers(self): return self.accelerator.optimizers - # TODO: Do we need getstate / setstate? 
- # def __getstate__(self): - # # unwrap optimizer - # self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] - # return self.__dict__ - # - # def __setstate__(self, d): - # self.__dict__ = d - # # wrap optimizers in enable_pl_optimzer is True - # self.convert_to_lightning_optimizers() + # TODO: refactor this so that it can be done in LightningOptimizer + def __getstate__(self): + # unwrap optimizer + self.accelerator.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + return self.__dict__ + + # TODO: refactor this so that it can be done in LightningOptimizer + def __setstate__(self, d): + self.__dict__ = d + # wrap optimizers if enable_pl_optimizer is True + self.accelerator.optimizers = self.convert_to_lightning_optimizers(self.optimizers) @property def require_distributed_sampler(self): From dcf917ad6f4c25ce71495c8247144684ccb0c793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 14:22:59 +0100 Subject: [PATCH 077/157] fix setting benchmark and deterministic option --- .../accelerators/accelerator_connector.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 3733fad589921..e89654416bbbe 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -118,6 +118,19 @@ def __init__( # NVIDIA setup # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + # benchmarking + # TODO: should this be moved to GPU accelerator? + torch.backends.cudnn.benchmark = self.benchmark + + # determinism for cudnn + # TODO: should this be moved to GPU accelerator? 
+ torch.backends.cudnn.deterministic = deterministic + if deterministic: + # fixing non-deterministic part of horovod + # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 + os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) + + # TODO: move this to TPU accelerator/plugin self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") self.replace_sampler_ddp = replace_sampler_ddp From 272f088581fa34b07ada2cd03c8ae97cd9d523fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 14:49:13 +0100 Subject: [PATCH 078/157] fix slurm amp test --- .../cluster_environments/slurm_environment.py | 4 ++-- tests/models/test_amp.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 870119414d27b..50da4bc42d5dc 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -32,7 +32,7 @@ def master_address(self): else: root_node = "127.0.0.1" - root_node = self._resolve_root_node_address(root_node) + root_node = self.resolve_root_node_address(root_node) os.environ["MASTER_ADDR"] = root_node log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node @@ -70,7 +70,7 @@ def world_size(self): def local_rank(self): return int(os.environ['SLURM_LOCALID']) - def _resolve_root_node_address(self, root_node): + def resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) number = numbers.split(',', maxsplit=1)[0] diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 55d32cc662701..ed2aa1ac99031 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -20,6 +20,8 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.cluster_environments import SLURMEnvironment +from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -138,10 +140,11 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' # test root model address - assert trainer.slurm_connector.resolve_root_node_address('abc') == 'abc' - assert trainer.slurm_connector.resolve_root_node_address('abc[23]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + assert isinstance(trainer.accelerator_connector.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc') == 'abc' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) From 45294760f8f52fa10dfcb1673773829fbcc7b382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 
11:26:58 +0100 Subject: [PATCH 079/157] fix prepare_data test and determine node_rank --- .../accelerators/data_parallel.py | 51 +++---------------- .../cluster_environment.py | 7 ++- .../cluster_environments/slurm_environment.py | 3 ++ .../torchelastic_environment.py | 17 ++++++- pytorch_lightning/trainer/properties.py | 5 ++ tests/core/test_datamodules.py | 28 ++++++---- 6 files changed, 52 insertions(+), 59 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index dcc6e4b139406..86ce580fdff79 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -90,23 +90,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - def determine_local_rank(self): - return int(os.environ.get('LOCAL_RANK', 0)) - - def determine_node_rank(self): - # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. - # otherwise use given node rank or default to node rank 0 - env_vars = ['NODE_RANK', 'GROUP_RANK'] - node_ids = [(k, os.environ.get(k, None)) for k in env_vars] - node_ids = [(k, v) for k, v in node_ids if v is not None] - if len(node_ids) == 0: - return 0 - if len(node_ids) > 1: - log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.") - k, rank = node_ids.pop() - rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") - return int(rank) - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop @@ -313,6 +296,7 @@ def __init__( self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None + self.node_rank = 0 self.num_processes = len(parallel_devices) @property @@ -332,18 +316,6 @@ def distributed_sampler_kwargs(self): ) return distributed_sampler_kwargs - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() - def setup(self, model): self._model = model @@ -436,8 +408,8 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx - # TODO: check from where we get node_rank and num_processes - self.global_rank = self.determine_node_rank() * self.num_processes + self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): @@ -549,6 +521,7 @@ def __init__( self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) + self.node_rank = 0 self.mp_queue = None @property @@ -579,8 +552,8 @@ def setup(self, model): def set_world_ranks(self, process_idx): self.local_rank = process_idx - # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + process_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes def start_training(self, trainer): @@ -704,18 +677,6 @@ def 
__recover_child_process_weights(self, best_path, last_path): ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() - def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 5196e44411082..6de290cd63ee9 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -26,8 +26,11 @@ def master_address(self): def master_port(self): pass - def world_size(self): + def world_size(self) -> int: return self._world_size - def local_rank(self): + def local_rank(self) -> int: + pass + + def node_rank(self) -> int: pass diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 50da4bc42d5dc..9710d654dff0d 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -70,6 +70,9 @@ def world_size(self): def local_rank(self): return int(os.environ['SLURM_LOCALID']) + def node_rank(self): + return int(os.environ['SLURM_NODEID']) + def resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index 5c14ea49b4cd0..89fd4ebb2cee0 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -16,7 +16,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn, rank_zero_info class TorchElasticEnvironment(ClusterEnvironment): @@ -50,3 +50,18 @@ def world_size(self): def local_rank(self): return int(os.environ['LOCAL_RANK']) + + def node_rank(self): + # TODO: use GROUP_RANK and provide a default environment class that uses NODE_RANK + # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. + # otherwise use given node rank or default to node rank 0 + env_vars = ['NODE_RANK', 'GROUP_RANK'] + node_ids = [(k, os.environ.get(k, None)) for k in env_vars] + node_ids = [(k, v) for k, v in node_ids if v is not None] + if len(node_ids) == 0: + return 0 + if len(node_ids) > 1: + log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. 
Using the first one.") + k, rank = node_ids.pop() + rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") + return int(rank) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index f7daa1c44708c..1982154b1ecf9 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -99,6 +99,11 @@ def local_rank(self): # some training types define a local rank return getattr(self.accelerator.training_type_plugin, "local_rank", 0) + @property + def node_rank(self): + # some training types define a local rank + return getattr(self.accelerator.training_type_plugin, "node_rank", 0) + @property def world_size(self): # some training types define a world size diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 9817e3c85a7e0..45a5c177d58fa 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,8 +13,9 @@ # limitations under the License. import pickle from argparse import ArgumentParser +from unittest import mock +from unittest.mock import MagicMock, PropertyMock from typing import Any, Dict -from unittest.mock import MagicMock import pytest import torch @@ -26,7 +27,9 @@ from tests.base.develop_utils import reset_seed -def test_can_prepare_data(tmpdir): +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.node_rank", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.local_rank", new_callable=PropertyMock) +def test_can_prepare_data(local_rank, node_rank): dm = BoringDataModule() trainer = Trainer() @@ -36,33 +39,36 @@ def test_can_prepare_data(tmpdir): # prepare_data_per_node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + + local_rank.return_value = 0 + assert trainer.local_rank == 0 assert trainer.data_connector.can_prepare_data() # local rank = 1 (False) - trainer.local_rank = 1 + local_rank.return_value = 1 + assert trainer.local_rank == 1 assert not trainer.data_connector.can_prepare_data() # prepare_data_per_node = False (prepare across all nodes) # global rank = 0 (True) trainer.prepare_data_per_node = False - trainer.node_rank = 0 - trainer.local_rank = 0 + node_rank.return_value = 0 + local_rank.return_value = 0 assert trainer.data_connector.can_prepare_data() # global rank = 1 (False) - trainer.node_rank = 1 - trainer.local_rank = 0 + node_rank.return_value = 1 + local_rank.return_value = 0 assert not trainer.data_connector.can_prepare_data() - trainer.node_rank = 0 - trainer.local_rank = 1 + node_rank.return_value = 0 + local_rank.return_value = 1 assert not trainer.data_connector.can_prepare_data() # 2 dm # prepar per node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + local_rank.return_value = 0 # is_overridden prepare data = True # has been called From 5319b0fefc916f82f6232339829c044e7d72ecec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 11:40:27 +0100 Subject: [PATCH 080/157] fix retrieving last path when testing --- pytorch_lightning/accelerators/data_parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 86ce580fdff79..a71051b5792b5 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -672,8 +672,7 @@ def __recover_child_process_weights(self, best_path, last_path): 
# todo, pass also best score # load last weights - # TODO: How to get self.trainer.testing? - if last_path is not None: # and not self.trainer.testing: + if last_path is not None and not self.lightning_module.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) From 3b54cfb2128a1b122b038fbf21b2da516c8ae3b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 11:41:52 +0100 Subject: [PATCH 081/157] remove obsolete plugin argument --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e89654416bbbe..224eed99b8863 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -231,7 +231,6 @@ def select_training_type_plugin(self): parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=cluster_environment, - is_slurm_managing_tasks=self.is_slurm_managing_tasks, sync_batchnorm=self.sync_batchnorm, ) elif self.use_dp: diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index a71051b5792b5..7ec9f3b82f0cf 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -283,14 +283,12 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment=None, - is_slurm_managing_tasks=False, sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes - self.is_slurm_managing_tasks = is_slurm_managing_tasks self.sync_batchnorm = sync_batchnorm self.dist = LightningDistributed() self._ddp_kwargs = kwargs @@ -510,13 +508,11 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment=None, - is_slurm_managing_tasks=False, sync_batchnorm=False, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes - self.is_slurm_managing_tasks = is_slurm_managing_tasks self.sync_batchnorm = sync_batchnorm self._ddp_kwargs = kwargs self.dist = LightningDistributed() From 6540b8785f530ef728c99181526e1dc9b99ef6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 12:04:48 +0100 Subject: [PATCH 082/157] fix test: test_trainer_config --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 224eed99b8863..181783d268f2f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -148,7 +148,8 @@ def tpu_id(self): @property def on_gpu(self): - return self.parallel_device_ids and torch.cuda.is_available() + gpus = self.parallel_device_ids + return gpus is not None and len(gpus) > 0 and torch.cuda.is_available() @property def num_gpus(self) -> int: @@ -335,6 +336,7 @@ def set_distributed_mode(self): rank_zero_warn( "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." 
) + self.parallel_device_ids = None self.use_ddp = True # HOROVOD From 6b450e165485f735b46d2a050eefaeb2ff9de7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 15:34:51 +0100 Subject: [PATCH 083/157] fix torchscript tests --- pytorch_lightning/core/lightning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 33d206b6bc49d..7d4fa62286062 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -66,6 +66,8 @@ class LightningModule( "on_gpu", "current_epoch", "global_step", + "global_rank", + "local_rank", ] + DeviceDtypeModuleMixin.__jit_unused_properties__ def __init__(self, *args, **kwargs): From 4ef539f2b7b87aa716daf71586da09d4fb9511e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:11:56 +0100 Subject: [PATCH 084/157] fix trainer.model access --- pytorch_lightning/trainer/properties.py | 9 +++++++++ pytorch_lightning/trainer/trainer.py | 9 +-------- tests/base/develop_pipelines.py | 11 ++--------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 1982154b1ecf9..8c4a64d128635 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -357,6 +357,15 @@ def checkpoint_callbacks(self) -> List[ModelCheckpoint]: def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) + @property + def model(self): + """ + The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. + To access the pure LightningModule, use + :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. + """ + return self.accelerator.model + def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a0d62d2a1104d..5ed45df5eaf8b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -353,7 +353,7 @@ def __init__( self.train_loop = TrainLoop(self, multiple_trainloader_mode) # training state - self.model = None + self.weights_summary = weights_summary self.shown_warnings = set() # init callbacks @@ -591,11 +591,6 @@ def pre_training_routine(self): else: raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) - # TODO: what the heck is this - # track model now. 
- # if cluster resets state, the model will update with the saved weights - # self.trainer.model = model - # restore training and model before hpc is called self.checkpoint_connector.restore_weights(ref_model) @@ -920,7 +915,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): self.tested_ckpt_path = ckpt_path self.testing = True os.environ["PL_TESTING_MODE"] = "1" - self.model = model results = self.fit(model) self.testing = False del os.environ["PL_TESTING_MODE"] @@ -941,7 +935,6 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval self.testing = True - self.model = model results = self.fit(model) self.testing = False diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 4949d53fc9a50..71747c21bf989 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -44,11 +44,6 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 for dataloader in test_loaders: run_prediction(pretrained_model, dataloader, min_acc=min_acc) - if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): - # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() - def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True, min_acc: float = 0.25): @@ -84,10 +79,8 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, if with_hpc: if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers( - pretrained_model - ) + trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ + trainer.init_optimizers(pretrained_model) # test HPC saving trainer.checkpoint_connector.hpc_save(save_dir, logger) From 1001ccfa581d5301cb9199fe4294d3248581e335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:23:59 +0100 Subject: [PATCH 085/157] move properties --- pytorch_lightning/trainer/properties.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 8c4a64d128635..62241722ff365 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -379,6 +379,38 @@ def lightning_module(self): def optimizers(self): return self.accelerator.optimizers + @optimizers.setter + def optimizers(self, new_optims): + self.accelerator.optimizers = new_optims + + @property + def lr_schedulers(self): + return self.accelerator.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers): + self.accelerator.lr_schedulers = new_schedulers + + @property + def optimizer_frequencies(self): + return self.accelerator.optimizer_frequencies + + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs): + self.accelerator.optimizer_frequencies = new_freqs + + @property + def amp_backend(self): + return self.accelerator.amp_backend + + @property + def precision(self): + return self.accelerator.precision + + @property + def scaler(self): + return self.accelerator.scaler + # TODO: 
refactor this so that it can be done in LightningOptimizer def __getstate__(self): # unwrap optimizer From 38a1d0fc3bde969b5f4b18c589cfae7e91396dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:44:58 +0100 Subject: [PATCH 086/157] fix test_transfer_batch_hook --- tests/models/test_hooks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 72f0790ca3df3..b2491389135f2 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +from unittest import mock from unittest.mock import MagicMock import pytest import torch +from unittest.mock import PropertyMock from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import TrainerState @@ -90,7 +92,8 @@ def training_epoch_end(self, outputs): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_transfer_batch_hook(): +@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +def test_transfer_batch_hook(model_getter_mock): class CustomBatch: @@ -116,7 +119,7 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) + model_getter_mock.return_value = model batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert model.hook_called From 46cf7effbf13980d8f3886945c53940d414da676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 17:16:20 +0100 Subject: [PATCH 087/157] fix auto_select_gpus --- pytorch_lightning/accelerators/accelerator_connector.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 181783d268f2f..efce11ab4bae6 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,6 +20,7 @@ from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info @@ -91,18 +92,16 @@ def __init__( if "LOCAL_RANK" in os.environ: rank_zero_only.rank = int(os.environ["LOCAL_RANK"]) - # TODO: Move autoselect GPUS to other place # for gpus allow int, string and gpu list - # if auto_select_gpus and isinstance(gpus, int): - # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) + if auto_select_gpus and isinstance(gpus, int): + self.gpus = pick_multiple_gpus(gpus) + self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids) - # self.root_device = torch.device("cpu") 
self.set_distributed_mode() self.configure_slurm_ddp() - # todo: select accelerator based on trainer flags self.accelerator = self.select_accelerator() # override dist backend when using tpus From 258f50e275b904ac755530f942c7ff6fb379cbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 17:38:48 +0100 Subject: [PATCH 088/157] fix omegaconf test --- pytorch_lightning/utilities/device_parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 9417bc13e8e8b..ce81ef0222b9e 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -14,6 +14,7 @@ from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch +from typing import Union, Any, List, Optional, Tuple, MutableSequence from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -148,7 +149,7 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, ...]]) -> Optional[List[int]]: assert gpus is not None - if isinstance(gpus, (list, tuple)): + if isinstance(gpus, (MutableSequence, tuple)): return list(gpus) # must be an int @@ -177,7 +178,7 @@ def _check_data_type(device_ids: Any) -> None: device_ids: gpus/tpu_cores parameter as passed to the Trainer """ if device_ids is not None and \ - (not isinstance(device_ids, (int, str, list, tuple)) or isinstance(device_ids, bool)): + (not isinstance(device_ids, (int, str, MutableSequence, tuple)) or isinstance(device_ids, bool)): raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") From a5d69b9a20fc2656eb24cf3f66d9ab747b13e63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 18:10:27 +0100 Subject: [PATCH 089/157] fix test that needs to simulate slurm ddp --- tests/models/test_amp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index ed2aa1ac99031..d80077f3855b9 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -109,11 +109,17 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@mock.patch.dict(os.environ, { + "SLURM_NTASKS": "1", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0" +}) def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" # simulate setting slurm flags tutils.set_random_master_port() - os.environ['SLURM_LOCALID'] = str(0) model = EvalModelTemplate() @@ -133,18 +139,17 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): callbacks=[checkpoint], logger=logger, ) - trainer.is_slurm_managing_tasks = True - trainer.fit(model) + result = trainer.fit(model) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' # test root model address - assert isinstance(trainer.accelerator_connector.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc') == 'abc' - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' - assert 
trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc') == 'abc' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) From 88a7ed5d31f5f1e5a5a1e1c3edbb3e151aac5a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 29 Dec 2020 21:14:10 +0100 Subject: [PATCH 090/157] add horovod plugin --- pytorch_lightning/accelerators/accelerator.py | 13 +- .../accelerators/accelerator_connector.py | 7 +- .../accelerators/data_parallel.py | 160 +++++++++++++++++- tests/models/test_horovod.py | 2 +- 4 files changed, 170 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8c1bfdc9301cb..465ed3dd237e5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin, HorovodPlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType @@ -106,12 +106,17 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): - return self.precision_plugin.backward( + output = self.precision_plugin.backward( self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) - def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + # TODO: this is a hack, find a better solution for this (hook?) 
+ if isinstance(self.training_type_plugin, HorovodPlugin): + optimizer.synchronize() + + return output + def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = ( @@ -119,6 +124,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) + self.training_type_plugin.pre_optimizer_step(optimizer, opt_idx) # model hook res = model_ref.optimizer_step( @@ -133,6 +139,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.post_optimizer_step(optimizer, opt_idx) + self.training_type_plugin.post_optimizer_step(optimizer, opt_idx) return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index efce11ab4bae6..825ea25a354fa 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser @@ -236,7 +236,7 @@ def select_training_type_plugin(self): elif self.use_dp: plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_horovod: - raise NotImplementedError + plugin = HorovodPlugin(parallel_devices=self.parallel_devices) else: plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -364,7 +364,10 @@ def _set_horovod_backend(self): hvd.init() if self.on_gpu: # Horovod assigns one local GPU per process + self.parallel_device_ids = list(range(hvd.local_size())) self.root_gpu = hvd.local_rank() + else: + self.num_processes = hvd.local_size() def check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 7ec9f3b82f0cf..02a748222732e 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,8 +1,12 @@ from abc import ABC, abstractmethod import re -from contextlib import contextmanager +from contextlib import contextmanager, ExitStack -from pytorch_lightning.cluster_environments import TorchElasticEnvironment +from torch.optim.lr_scheduler import _LRScheduler + +from pytorch_lightning.cluster_environments import TorchElasticEnvironment, ClusterEnvironment +from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.utilities import HOROVOD_AVAILABLE from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -26,6 +30,9 @@ import torch.multiprocessing as mp from 
pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info +if HOROVOD_AVAILABLE: + import horovod.torch as hvd + try: from hydra.utils import to_absolute_path, get_original_cwd from hydra.core.hydra_config import HydraConfig @@ -166,7 +173,11 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_devices: List[torch.device], cluster_environment=None): + def __init__( + self, + parallel_devices: List[torch.device], + cluster_environment: Optional[ClusterEnvironment] = None, + ): super().__init__() self.parallel_devices = parallel_devices self.local_rank = 0 @@ -240,6 +251,9 @@ def block_backward_sync(self): class DataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + def setup(self, model): self._model = LightningDataParallel(model, self.parallel_devices) @@ -282,7 +296,7 @@ def __init__( self, parallel_devices, num_nodes=1, - cluster_environment=None, + cluster_environment: ClusterEnvironment = None, sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: @@ -507,7 +521,7 @@ def __init__( self, parallel_devices, num_nodes=1, - cluster_environment=None, + cluster_environment: ClusterEnvironment = None, sync_batchnorm=False, **kwargs: Dict[str, Any] ): @@ -690,6 +704,140 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output -# TODO: DDP2 (?), HOROVOD DDP AND HPC DDP +# TODO: DDP2 class DDP2Plugin(DDPPlugin): pass + + +class HorovodPlugin(ParallelPlugin): + + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=hvd.size(), + rank=hvd.rank() + ) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + self.global_rank = hvd.rank() + self.local_rank = hvd.local_rank() + rank_zero_only.rank = self.global_rank + + self.model_to_device() + + def pre_training(self): + + def _unpack_lightning_optimizer(opt): + return opt._optimizer if isinstance(opt, LightningOptimizer) else opt + + optimizers = self.lightning_module.trainer.optimizers + optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] + + # Horovod: scale the learning rate by the number of workers to account for + # increased total batch size + for optimizer in optimizers: + for param_group in optimizer.param_groups: + param_group['lr'] *= hvd.size() + + # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR + lr_schedulers = self.lightning_module.trainer.lr_schedulers + for scheduler in lr_schedulers: + scheduler = scheduler['scheduler'] + if isinstance(scheduler, _LRScheduler): + scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] + + # Horovod: broadcast parameters & optimizer state to ensure consistent initialization + hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) + for optimizer in optimizers: + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + def _filter_named_parameters(model, optimizer): + opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])]) + return [(name, p) for name, p in model.named_parameters() if p in 
opt_params] + + # Horovod: wrap optimizers to perform gradient aggregation via allreduce + optimizers = [ + hvd.DistributedOptimizer(optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer)) + for optimizer in optimizers + ] + + optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) + self.lightning_module.trainer.optimizers = optimizers + + def start_training(self, trainer): + with ExitStack() as stack: + for optimizer in trainer.optimizers: + # Synchronization will be performed explicitly following backward() + stack.enter_context(optimizer.skip_synchronize()) + + # set up training routine + self._results = trainer.train() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def start_testing(self, trainer): + with ExitStack() as stack: + # set up training routine + # self.trainer.train_loop.setup_training(self.trainer.model) + self._results = trainer.run_test() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def barrier(self, *args, **kwargs): + hvd.join() + + def broadcast(self, obj: object, src: int = 0) -> object: + obj = hvd.broadcast_object(obj, src) + return obj + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " + "Unset `group`." + ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(output, op=reduce_op) + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " + "Unset `group`." 
+ ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 6b2eaef1f1da8..623f329035533 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -317,7 +317,7 @@ def _compute_batch(): metric = Accuracy(compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=trainer.accelerator_backend.gather_all_tensors, + dist_sync_fn=trainer.training_type_plugin.gather_all_tensors, threshold=threshold) for i in range(hvd.rank(), num_batches, hvd.size()): From 40daa41def2f77a5760470b0fed813397e58e629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 29 Dec 2020 21:33:32 +0100 Subject: [PATCH 091/157] fix test with named arguments --- tests/core/test_lightning_module.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 9d45310a1de54..f2936c7f19d55 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -117,15 +117,15 @@ def configure_optimizers(self): optimizer_2 = Adam(self.layer.parameters(), lr=0.1) return [optimizer, optimizer_2] - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, - on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * 0.01 - - optimizer.step(closure=closure) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False): + # warm up lr + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) 
+ for pg in optimizer.param_groups: + pg['lr'] = lr_scale * 0.01 + + optimizer.step(closure=optimizer_closure) model = TestModel() model.training_epoch_end = None From 96fc074d017c478aa5c578e0da70464d6dc9c683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 30 Dec 2020 00:12:23 +0100 Subject: [PATCH 092/157] clean up whitespace --- pytorch_lightning/accelerators/accelerator.py | 13 ++++++------- .../accelerators/accelerator_connector.py | 2 -- pytorch_lightning/trainer/trainer.py | 9 --------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 465ed3dd237e5..07777d982b2d6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,8 +1,10 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.accelerators.base_plugin import Plugin +import os + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.data_parallel import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType -from typing import Any, Union +from pytorch_lightning.utilities import AMPType +from typing import Any import math import torch @@ -159,8 +161,6 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - model = self.lightning_module - # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX # if self.trainer.amp_backend == AMPType.APEX: @@ -215,7 +215,6 @@ def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: Lightn def connect_precision_plugin(self, plugin: PrecisionPlugin): model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) - self.model = model self.optimizers = optimizers self.schedulers = schedulers diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 825ea25a354fa..7addf4bdd72c2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union import os import torch @@ -272,7 +271,6 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): - # No distributed backend if self.distributed_backend is None: # horovod multi GPU diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5ed45df5eaf8b..382b6e3c5ae8e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -520,14 +520,6 @@ def fit( self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) - # ---------------------------- - # INSPECT THESE FOR MAIN LOOPS - # ---------------------------- - # assign training and eval functions... 
inspect these to see the train and eval loops :) - # self.accelerator_backend.train_loop = self.train - # self.accelerator_backend.validation_loop = self.run_evaluation - # self.accelerator_backend.test_loop = self.run_evaluation - # ---------------------------- # TRAIN # ---------------------------- @@ -562,7 +554,6 @@ def fit( # return 1 when finished # used for testing or when we need to know that training succeeded - if self._state != TrainerState.INTERRUPTED: self._state = TrainerState.FINISHED return results or 1 From 210831ab6bd86d661d16e296a3ee107dcd0c9b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 30 Dec 2020 00:21:16 +0100 Subject: [PATCH 093/157] fix datamodules test --- tests/core/test_datamodules.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 45a5c177d58fa..7796c9c074d6e 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -23,6 +23,7 @@ from pytorch_lightning import LightningDataModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState +from pytorch_lightning.utilities.model_helpers import is_overridden from tests.base import BoringDataModule, BoringModel from tests.base.develop_utils import reset_seed @@ -397,7 +398,8 @@ def test_full_loop_dp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_dm_transfer_batch_to_device(tmpdir): +@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +def test_dm_transfer_batch_to_device(get_module_mock): class CustomBatch: def __init__(self, data): self.samples = data[0] @@ -420,9 +422,9 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) - - model.transfer_batch_to_device = dm.transfer_batch_to_device + get_module_mock.return_value = model + if is_overridden('transfer_batch_to_device', dm): + model.transfer_batch_to_device = dm.transfer_batch_to_device batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) From 98b6dd4569806a2fd45462888da795813d51f3fc Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 17:31:11 +0100 Subject: [PATCH 094/157] remove old accelerators --- .../accelerators/old/accelerator.py | 259 --------------- .../accelerators/old/ddp2_accelerator.py | 268 ---------------- .../old/ddp_cpu_hpc_accelerator.py | 48 --- .../old/ddp_cpu_spawn_accelerator.py | 297 ------------------ .../accelerators/old/dp_accelerator.py | 189 ----------- .../accelerators/old/gpu_accelerator.py | 108 ------- 6 files changed, 1169 deletions(-) delete mode 100644 pytorch_lightning/accelerators/old/accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp2_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/dp_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/old/accelerator.py b/pytorch_lightning/accelerators/old/accelerator.py deleted file mode 100644 index 
b16e0125054bb..0000000000000 --- a/pytorch_lightning/accelerators/old/accelerator.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import math -from enum import Enum -from pytorch_lightning.core.lightning import LightningModule -from typing import Any, Optional, Union - -import torch - -from pytorch_lightning.utilities import AMPType, rank_zero_warn -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.parsing import AttributeDict -import torch.distributed as torch_distrib -from pytorch_lightning import _logger as log - -try: - from apex import amp -except ImportError: - amp = None - -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - - class ReduceOp: - SUM = None - - -EPSILON = 1e-6 -EPSILON_FP16 = 1e-5 - - -class Accelerator(object): - def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None): - self.trainer = trainer - self.nickname = None - self.cluster_environment = cluster_environment - self.dist = AttributeDict(rank=0, device=None) - self.ddp_plugin = ddp_plugin - - if trainer is not None: - self.train_loop = self.trainer.train - self.validation_loop = self.trainer.run_evaluation - self.test_loop = self.trainer.run_evaluation - - def setup(self, model): - pass - - def teardown(self): - # Ensure if necessary all processes are finished - self.barrier() - - def barrier(self, name: Optional[str] = None): - pass - - def broadcast(self, obj, src=0): - return obj - - def train_or_test(self): - if self.trainer.testing: - results = self.trainer.run_test() - else: - results = self.trainer.train() - return results - - def batch_to_device(self, batch: Any, device: torch.device): - model = self.trainer.get_model() - if model is not None: - return model.transfer_batch_to_device(batch, device) - return move_data_to_device(batch, device) - - def training_step_end(self, output): - return output - - def test_step_end(self, output): - return output - - def validation_step_end(self, output): - return output - - def process_dataloader(self, dataloader): - return dataloader - - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - if self.trainer.precision == 16: - closure_loss = self.trainer.precision_connector.backend.backward( - closure_loss, optimizer, opt_idx, *args, **kwargs - ) - else: - # do backward pass - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss - - def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): - model_ref = self.trainer.get_model() - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE - - # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: 
- raise MisconfigurationException( - "native PyTorch amp and lbfgs are not compatible." - " To request, please file a Github issue in PyTorch and tag @mcarilli" - ) - - # model hook - model_ref.optimizer_step( - epoch=self.trainer.current_epoch, - batch_idx=batch_idx, - optimizer=optimizer, - optimizer_idx=opt_idx, - optimizer_closure=lambda_closure, - on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, - using_lbfgs=is_lbfgs, - ) - - # scale when native amp - if native_amp: - self.trainer.scaler.update() - - def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): - model_ref = self.trainer.get_model() - model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - - def clip_gradients(self, optimizer, clip_val=None): - # TODO: separate TPU case from here - self._clip_gradients(optimizer, clip_val) - - def _clip_gradients(self, optimizer, clip_val=None): - # use the trainer's clip val if none passed - grad_clip_val = self.trainer.gradient_clip_val - if clip_val is not None: - grad_clip_val = clip_val - grad_clip_val = float(grad_clip_val) - - # this code is a modification of torch.nn.utils.clip_grad_norm_ - # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md - if grad_clip_val <= 0: - return - - model = self.trainer.get_model() - if self.trainer.amp_backend == AMPType.APEX: - parameters = amp.master_params(optimizer) - else: - parameters = model.parameters() - - max_norm = grad_clip_val - norm_type = float(2.0) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - if norm_type == math.inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - device = parameters[0].device - out = torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) - - def on_train_epoch_end(self, outputs): - pass - - def on_train_end(self): - pass - - def early_stopping_should_stop(self, pl_module): - return self.trainer.should_stop - - def setup_optimizers(self, model): - if self.trainer.testing is True: - return - - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.trainer.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def sync_tensor( - self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None 
- ) -> torch.Tensor: - """ - Function to reduce a tensor from several distributed processes to one aggregated tensor. - - Args: - tensor: the tensor to sync and reduce - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. - - Return: - reduced value - """ - raise NotImplementedError() - - def __getstate__(self): - return { - "trainer": self.trainer, - "nickname": self.nickname, - "cluster_environment": self.cluster_environment, - "dist": self.dist, - "ddp_plugin": self.ddp_plugin, - } - - def __setstate__(self, d): - self.trainer = d["trainer"] - self.nickname = d["nickname"] - self.cluster_environment = d["cluster_environment"] - self.dist = d["dist"] - self.ddp_plugin = d["ddp_plugin"] - - -# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos... -class BackendType(Enum): - DP = "dp" - DDP = "ddp" - DDP2 = "ddp2" - DDP_SPAWN = "ddp_spawn" - # decuple distrib and device - DDP_CPU = "ddp_cpu" - HOROVOD = "horovod" - # this is rather device - TPU = "tpu" diff --git a/pytorch_lightning/accelerators/old/ddp2_accelerator.py b/pytorch_lightning/accelerators/old/ddp2_accelerator.py deleted file mode 100644 index a5e8d720ce186..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp2_accelerator.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available - - -class DDP2Accelerator(Accelerator): - - def __init__(self, - trainer, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP2 strategy on a cluster - - Example:: - - # default - trainer = Trainer(accelerator=DDP2Accelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self.dist = LightningDistributed() - self.nickname = 'ddp2' - - def setup(self, model): - self.trainer.model = model - self.task_idx = self.cluster_environment.local_rank() - - def train(self): - model = self.trainer.model - return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def _step(self, args): - args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def training_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def validation_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def test_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def set_world_ranks(self, process_idx): - # Todo: required argument `process_idx` is not used - self.trainer.local_rank = self.trainer.node_rank - self.trainer.global_rank = self.trainer.node_rank - self.trainer.world_size = self.trainer.num_nodes - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def init_device(self, process_idx): - self.trainer.root_gpu = process_idx - torch.cuda.set_device(self.trainer.root_gpu) - - def model_to_device(self, model): - model.cuda(self.trainer.root_gpu) - - def get_device_ids(self): - device_ids = self.trainer.data_parallel_device_ids - return device_ids - - def ddp_train(self, process_idx, mp_queue, model): - """ - Entry point for ddp - - Args: - process_idx: current process rank - mp_queue: multiprocessing queue - model: pointer to current :class:`LightningModule` - - Returns: - Dict with evaluation results - - """ - # Todo: required argument `mp_queue` is not used - # show progressbar only on progress_rank 0 - if 
(self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # Initialize cuda device - self.init_device(process_idx) - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - if isinstance(self.ddp_plugin, RPCPlugin): - if not self.ddp_plugin.is_main_rpc_process: - self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) - self.ddp_plugin.exit_rpc_process() - if self.ddp_plugin.return_after_exit_rpc_process: - return - else: - self.ddp_plugin.on_main_rpc_connection(self.trainer) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - self.ddp_plugin.on_after_setup_optimizers(self.trainer) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) - - def get_reference_model(self, model) -> LightningModule: - return self.ddp_plugin.get_model_from_plugin(model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=self.trainer.num_nodes, - rank=self.trainer.global_rank - ) - if self.ddp_plugin is not None: - distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) - return distributed_sampler_kwargs - - @property - def require_distributed_sampler(self): - return True diff --git a/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py deleted file mode 100644 index 7db8e3defdb21..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -from typing import Optional - -from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin - - -class DDPCPUHPCAccelerator(DDPHPCAccelerator): - - def __init__(self, - trainer, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP (with CPUs) strategy on a cluster - - Example:: - - # default - trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.nickname = 'ddp_cpu' - - def model_to_device(self, model, process_idx): - # Todo: required argument `process_idx` is not used - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def init_device(self, process_idx): - pass diff --git a/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py deleted file mode 100644 index b15b9e8062257..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -import torch.multiprocessing as mp -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import ( - all_gather_ddp_if_available, - find_free_network_port, - rank_zero_only, - rank_zero_warn, - sync_ddp_if_available, -) - - -class DDPCPUSpawnAccelerator(Accelerator): - - def __init__(self, - trainer, - nprocs: int, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn - - Example:: - - # default - trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.mp_queue = None - self.nprocs = nprocs - self.dist = LightningDistributed() - self.nickname = 'ddp_cpu' - - def setup(self, model): - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() - - self.trainer.model = model - - def train(self): - model = self.trainer.model - - # train in children process - mp.spawn(self.ddp_train, nprocs=self.nprocs, args=(self.mp_queue, model,)) - - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path) - return results - - def ddp_train(self, process_idx, mp_queue, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - """ - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - if isinstance(self.ddp_plugin, RPCPlugin): - if not self.ddp_plugin.is_main_rpc_process: - self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) - self.ddp_plugin.exit_rpc_process() - if self.ddp_plugin.return_after_exit_rpc_process: - return - else: - self.ddp_plugin.on_main_rpc_connection(self.trainer) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not 
torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - self.ddp_plugin.on_after_setup_optimizers(self.trainer) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - # DDP spawn already spawned off each process... no need to do anything - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # get original model - model = self.trainer.get_model() - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - - # clean up memory - torch.cuda.empty_cache() - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def _step(self, args): - args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM) - torch_distrib.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - # Todo: required argument `process_idx` is not used - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def __recover_child_process_weights(self, model, best_path): - # transfer back the best path to the trainer - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path - - self.trainer.model = model - - def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): - # Todo: required argument `model` is not used - # track the best model path - best_model_path = None - if self.trainer.checkpoint_callback is not None: - best_model_path = self.trainer.checkpoint_callback.best_model_path - - if self.trainer.global_rank == 0 and mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - mp_queue.put(best_model_path) 
- mp_queue.put(results) - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) - - def get_reference_model(self, model) -> LightningModule: - return self.ddp_plugin.get_model_from_plugin(model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=self.trainer.num_nodes * self.trainer.num_processes, - rank=self.trainer.global_rank - ) - if self.ddp_plugin is not None: - distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) - return distributed_sampler_kwargs - - @property - def require_distributed_sampler(self): - return True diff --git a/pytorch_lightning/accelerators/old/dp_accelerator.py b/pytorch_lightning/accelerators/old/dp_accelerator.py deleted file mode 100644 index 847d156d4f11d..0000000000000 --- a/pytorch_lightning/accelerators/old/dp_accelerator.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
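# A short sketch (plain torch only, illustrative names) of how the
# `distributed_sampler_kwargs` computed above are consumed: roughly, when
# `require_distributed_sampler` is True the trainer builds a DistributedSampler
# from these kwargs, so every rank iterates over a disjoint 1/num_replicas shard.
import torch
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset


def build_distributed_loader(dataset, num_nodes, num_processes, global_rank, batch_size=32):
    sampler_kwargs = dict(
        num_replicas=num_nodes * num_processes,  # same formula as distributed_sampler_kwargs
        rank=global_rank,
    )
    sampler = DistributedSampler(dataset, **sampler_kwargs)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)


# example: rank 0 of a 1-node / 2-process job sees half of the 128 samples
dataset = TensorDataset(torch.randn(128, 32))
loader = build_distributed_loader(dataset, num_nodes=1, num_processes=2, global_rank=0)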
-from typing import Optional - -import torch -from torch import optim - -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDataParallel -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.exceptions import MisconfigurationException - - -class DataParallelAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): - """ - Runs training using DP via manual start (not HPC cluster) - - Example:: - - # default - trainer = Trainer(accelerator=DataParallelAccelerator()) - - """ - super().__init__(trainer, cluster_environment) - self.model_autocast_original_forward = None - self.dist = LightningDistributed() - self.nickname = 'dp' - - def setup(self, model): - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # put model on correct device - model.cuda(self.trainer.root_gpu) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # init torch data parallel - model = self.__init_torch_data_parallel(model) - - # hack forward to do autocast for the user - self.model_autocast_original_forward = model.forward - - # init half precision - if self.trainer.amp_backend: - model = self.__init_half_precision(model) - - self.trainer.convert_to_lightning_optimizers() - - self.trainer.model = model - - def __init_torch_data_parallel(self, model): - # create list of device ids - device_ids = self.trainer.data_parallel_device_ids - if isinstance(device_ids, int): - device_ids = list(range(device_ids)) - - # set dp device - torch.cuda.set_device(self.trainer.root_gpu) - model = LightningDataParallel(model, device_ids=device_ids) - return model - - def __init_half_precision(self, model): - if self.trainer.amp_backend == AMPType.NATIVE: - self.__init_native_amp(model) - else: - model = self.__init_nvidia_apex(model) - return model - - def __init_native_amp(self, model): - model.forward = torch.cuda.amp.autocast()(model.forward) - - def __init_nvidia_apex(self, model): - # check for this bug (amp + dp + !01 doesn't work) - # https://github.com/NVIDIA/apex/issues/227 - if self.trainer.amp_level == 'O2': - raise MisconfigurationException( - f'Amp level {self.trainer.amp_level} with DataParallel is not supported.' - f' See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.' 
- f' We recommend you switch to ddp if you want to use amp') - else: - model = self.trainer.precision_connector.connect(model) - - return model - - def train(self): - model = self.trainer.model - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - return results - - def teardown(self): - # replace the original fwd function - self.trainer.model.forward = self.model_autocast_original_forward - self.barrier() - - def _step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def training_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def validation_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def test_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - """ - Reinitialize optimizer.step properties added by schedulers - """ - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - is_regular_scheduler = optim.lr_scheduler._LRScheduler - is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau - if is_regular_scheduler or is_lr_reduce_on_plateau: - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - - def get_reference_model(self, model) -> LightningModule: - if isinstance(model, LightningDataParallel): - return model.module - return model - - @property - def require_distributed_sampler(self): - return False diff --git a/pytorch_lightning/accelerators/old/gpu_accelerator.py b/pytorch_lightning/accelerators/old/gpu_accelerator.py deleted file mode 100644 index 2fe3b26679f5c..0000000000000 --- a/pytorch_lightning/accelerators/old/gpu_accelerator.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
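# A minimal sketch (plain torch, no Lightning internals) of why the DP
# `*_step_end` hooks above call `dp_reduce()` / `.mean()`: with DataParallel
# every replica returns its own scalar, so the gathered output holds one entry
# per GPU and must be reduced to a single value before backward/logging. The
# dict branch below is an illustrative stand-in for `Result.dp_reduce()`.
import torch


def dp_training_step_end(output):
    # `output` is what DataParallel gathered from the replicas: either a tensor
    # of shape (num_gpus,) or a dict containing such tensors.
    if isinstance(output, torch.Tensor):
        return output.mean()
    if isinstance(output, dict):
        return {k: v.mean() if isinstance(v, torch.Tensor) else v for k, v in output.items()}
    return output


# e.g. two replicas returned losses 0.50 and 0.70 -> reduced loss 0.60
reduced = dp_training_step_end(torch.tensor([0.50, 0.70]))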
-from typing import Any, Callable, Optional, Union - -import torch - -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType - - -class GPUAccelerator(Accelerator): - amp_backend: AMPType - - def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): - """ - Runs training using a single GPU - - Example:: - - # default - trainer = Trainer(accelerator=GPUAccelerator()) - - """ - super().__init__(trainer, cluster_environment) - self.dist = LightningDistributed() - self.nickname = None - - def setup(self, model): - - # call setup - self.trainer.call_setup_hook(model) - - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - self.trainer.model = model - - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - - def _step(self, model_step: Callable, args): - args[0] = self.to_device(args[0]) - - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = model_step(*args) - else: - output = model_step(*args) - - return output - - def training_step(self, args): - return self._step(self.trainer.model.training_step, args) - - def validation_step(self, args): - return self._step(self.trainer.model.validation_step, args) - - def test_step(self, args): - return self._step(self.trainer.model.test_step, args) - - def to_device(self, batch): - gpu_id = 0 - if isinstance(self.trainer.data_parallel_device_ids, list): - gpu_id = self.trainer.data_parallel_device_ids[0] - - # Don't copy the batch since there is a single gpu that the batch could - # be referenced from and if there are multiple optimizers the batch will - # wind up copying it to the same device repeatedly. 
- return self.batch_to_device(batch, gpu_id) - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return tensor - - @property - def require_distributed_sampler(self): - return False From dfcbba6241376f4f7b8c17bae4d37e4218089ec8 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 17:31:19 +0100 Subject: [PATCH 095/157] fix naming --- pytorch_lightning/accelerators/accelerator.py | 58 +---------- .../accelerators/accelerator_connector.py | 10 +- pytorch_lightning/accelerators/cpu.py | 14 +++ pytorch_lightning/accelerators/gpu.py | 25 +++++ pytorch_lightning/accelerators/tpu.py | 13 +++ pytorch_lightning/trainer/data_loading.py | 4 +- pytorch_lightning/trainer/properties.py | 4 +- pytorch_lightning/trainer/trainer.py | 4 +- test.py | 97 +++++++++++++++++++ tests/backends/test_accelerator_connector.py | 32 +++--- tests/core/test_datamodules.py | 2 +- tests/models/test_hooks.py | 30 +++--- tests/models/test_horovod.py | 4 +- tests/models/test_tpu.py | 4 +- 14 files changed, 197 insertions(+), 104 deletions(-) create mode 100644 pytorch_lightning/accelerators/cpu.py create mode 100644 pytorch_lightning/accelerators/gpu.py create mode 100644 pytorch_lightning/accelerators/tpu.py create mode 100644 test.py diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 07777d982b2d6..81eb112206d28 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -21,7 +21,7 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device -class NewAccelerator(object): +class Accelerator(object): def __init__( self, precision_plugin: PrecisionPlugin, @@ -161,15 +161,6 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX - - # if self.trainer.amp_backend == AMPType.APEX: - # parameters = self.precision_plugin.master_params(optimizer) - # else: - # parameters = model.parameters() - - # TODO - # ... or we call master_params() and in the default plugin we return the model.parameters() parameters = self.precision_plugin.master_params(optimizer) max_norm = grad_clip_val @@ -246,7 +237,6 @@ def scaler(self): def rpc_enabled(self): return self.training_type_plugin.rpc_enabled - # TODO: Check where this comes from and why it is needed def optimizer_state(self, optimizer: Optimizer) -> dict: """ Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom @@ -259,48 +249,4 @@ def optimizer_state(self, optimizer: Optimizer) -> dict: return optimizer.state_dict() def on_save(self, checkpoint): - return checkpoint - - -class NewCPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - if isinstance(self.precision_plugin, MixedPrecisionPlugin): - MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") - - if "cpu" not in str(self.root_device): - raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") - - return super().setup(trainer, model) - - -class NewGPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - if "cuda" not in str(self.root_device): - raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") - torch.cuda.set_device(self.root_device) - model.to(self.root_device) - - return super().setup(trainer, model) - - def on_train_start(self): - # clear cache before training - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - - def on_train_end(self): - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - -# TODO: Complete the TPUAccelerator -class NewTPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - raise NotImplementedError - - def on_train_start(self): - raise NotImplementedError - - def on_train_end(self): - raise NotImplementedError + return checkpoint \ No newline at end of file diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 7addf4bdd72c2..e03e51cbba6ed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -15,7 +15,9 @@ import os import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin @@ -241,14 +243,14 @@ def select_training_type_plugin(self): return plugin def select_accelerator(self): - if isinstance(self.distributed_backend, NewAccelerator): + if isinstance(self.distributed_backend, Accelerator): # custom accelerator from user return self.distributed_backend if self.on_gpu: - acc_cls = NewGPUAccelerator + acc_cls = GPUAccelerator else: - acc_cls = NewCPUAccelerator + acc_cls = CPUAccelerator return acc_cls( precision_plugin=self.select_precision_plugin(), diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py new file mode 100644 index 0000000000000..e9f49e20a464f --- /dev/null +++ b/pytorch_lightning/accelerators/cpu.py @@ -0,0 +1,14 @@ +from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class CPUAccelerator(Accelerator): + def setup(self, trainer, model): + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") + + if "cpu" not in str(self.root_device): + raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") + + return super().setup(trainer, model) \ No newline at end of file diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py new file mode 100644 index 0000000000000..7b2cbe3627e0b --- /dev/null +++ b/pytorch_lightning/accelerators/gpu.py @@ -0,0 +1,25 @@ +import torch +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.accelerator import Accelerator + + +class GPUAccelerator(Accelerator): + def setup(self, trainer, model): + if "cuda" not in str(self.root_device): + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + torch.cuda.set_device(self.root_device) + model.to(self.root_device) + + return super().setup(trainer, model) + + def on_train_start(self): + # clear cache before training + # use context because of: + # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() + + def on_train_end(self): + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() \ No newline at end of file diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py new file mode 100644 index 0000000000000..bf922b1c2df8e --- /dev/null +++ b/pytorch_lightning/accelerators/tpu.py @@ -0,0 +1,13 @@ +# TODO: Complete the TPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator + + +class TPUAccelerator(Accelerator): + def setup(self, trainer, model): + raise NotImplementedError + + def on_train_start(self): + raise NotImplementedError + + def on_train_end(self): + raise NotImplementedError diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index cc5fc492b3a6a..4c77f353c0688 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.core import LightningModule from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import rank_zero_warn @@ -51,7 +51,7 @@ class TrainerDataLoadingMixin(ABC): limit_val_batches: Union[int, float] limit_test_batches: Union[int, float] replace_sampler_ddp: bool - accelerator_backend: NewAccelerator + accelerator_backend: Accelerator num_nodes: int num_processes: int distributed_backend: Optional[str] diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 62241722ff365..494e91a298843 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,7 +17,7 @@ from argparse import ArgumentParser, Namespace from typing import cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint, EarlyStopping from 
pytorch_lightning.core.lightning import LightningModule @@ -63,7 +63,7 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str - accelerator_backend: NewAccelerator + accelerator_backend: Accelerator num_nodes: int num_processes: int accelerator_connector: BackendConnector diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 382b6e3c5ae8e..4d0718c5e2b48 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -79,7 +79,7 @@ from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.plugins.plugin_connector import PluginConnector -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer warnings.filterwarnings( @@ -129,7 +129,7 @@ def __init__( val_check_interval: Union[int, float] = 1.0, flush_logs_every_n_steps: int = 100, log_every_n_steps: int = 50, - accelerator: Optional[Union[str, NewAccelerator]] = None, + accelerator: Optional[Union[str, Accelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = "top", diff --git a/test.py b/test.py new file mode 100644 index 0000000000000..959436c179c21 --- /dev/null +++ b/test.py @@ -0,0 +1,97 @@ +import torch +import pytorch_lightning as pl + +class RandomDataset(torch.utils.data.Dataset): + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +class BoringModel(pl.LightningModule): + + def __init__(self): + """ + Testing PL Module + + Use as follows: + - subclass + - modify the behavior for what you want + + class TestModel(BaseTestModel): + def training_step(...): + # do your own thing + + or: + + model = BaseTestModel() + model.training_epoch_end = None + + """ + super().__init__() + self.layer = torch.nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + def loss(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + + def step(self, x): + x = self(x) + out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) + return out + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"loss": loss} + + def training_step_end(self, training_step_outputs): + return training_step_outputs + + def training_epoch_end(self, outputs) -> None: + torch.stack([x["loss"] for x in outputs]).mean() + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + # def validation_epoch_end(self, outputs) -> None: + # torch.stack([x['x'] for x in outputs]).mean() + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + torch.stack([x["y"] for x in outputs]).mean() + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def train_dataloader(self): + return 
torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def val_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + @property + def automatic_optimization(self): + return True + +if __name__ == '__main__': + pl.Trainer(gpus=[1,], max_epochs=20, amp_backend='native').fit(BoringModel(), torch.utils.data.DataLoader(RandomDataset(32, 500))) \ No newline at end of file diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 37a1911be38d3..b6f27f32a85fc 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -18,10 +18,12 @@ import pytest import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewGPUAccelerator, NewAccelerator +from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import PrecisionPlugin -from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel @@ -31,7 +33,7 @@ def test_accelerator_choice_cpu(tmpdir): trainer = Trainer( fast_dev_run=True, ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) @@ -40,7 +42,7 @@ def test_accelerator_choice_ddp_cpu(tmpdir): fast_dev_run=True, accelerator='ddp_cpu', ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -53,7 +55,7 @@ def test_accelerator_choice_ddp(tmpdir): accelerator='ddp', gpus=1, ) - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -66,7 +68,7 @@ def test_accelerator_choice_ddp_spawn(tmpdir): accelerator='ddp_spawn', gpus=1, ) - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -84,7 +86,7 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -117,7 +119,7 @@ class CB(Callback): 
def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -148,7 +150,7 @@ def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -178,7 +180,7 @@ def test_accelerator_choice_ddp2_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -207,7 +209,7 @@ def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -239,7 +241,7 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) raise SystemExit() @@ -276,7 +278,7 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) raise SystemExit() @@ -303,7 +305,7 @@ def on_fit_start(self, trainer, pl_module): }) @mock.patch('torch.cuda.device_count', return_value=0) def test_custom_accelerator(tmpdir): - class Accel(NewAccelerator): + class Accel(Accelerator): pass class Prec(PrecisionPlugin): @@ -337,7 +339,7 @@ class TrainTypePlugin(SingleDevicePlugin): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) raise SystemExit() diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 
7796c9c074d6e..c28e1bdb8d658 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -398,7 +398,7 @@ def test_full_loop_dp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) def test_dm_transfer_batch_to_device(get_module_mock): class CustomBatch: def __init__(self, data): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index b2491389135f2..cfcd680cb0080 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -55,20 +55,19 @@ def test_training_epoch_end_metrics_collection(tmpdir): num_epochs = 3 class CurrentModel(EvalModelTemplate): - def training_step(self, *args, **kwargs): output = super().training_step(*args, **kwargs) - output['progress_bar'].update({'step_metric': torch.tensor(-1)}) - output['progress_bar'].update({'shared_metric': 100}) + output["progress_bar"].update({"step_metric": torch.tensor(-1)}) + output["progress_bar"].update({"shared_metric": 100}) return output def training_epoch_end(self, outputs): epoch = self.current_epoch # both scalar tensors and Python numbers are accepted return { - 'progress_bar': { - f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch - 'shared_metric': 111, + "progress_bar": { + f"epoch_metric_{epoch}": torch.tensor(epoch), # add a new metric key every epoch + "shared_metric": 111, } } @@ -83,20 +82,18 @@ def training_epoch_end(self, outputs): metrics = trainer.progress_bar_dict # metrics added in training step should be unchanged by epoch end method - assert metrics['step_metric'] == -1 + assert metrics["step_metric"] == -1 # a metric shared in both methods gets overwritten by epoch_end - assert metrics['shared_metric'] == 111 + assert metrics["shared_metric"] == 111 # metrics are kept after each epoch for i in range(num_epochs): - assert metrics[f'epoch_metric_{i}'] == i + assert metrics[f"epoch_metric_{i}"] == i @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) def test_transfer_batch_hook(model_getter_mock): - class CustomBatch: - def __init__(self, data): self.samples = data[0] self.targets = data[1] @@ -120,16 +117,13 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead model_getter_mock.return_value = model - batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) - expected = torch.device('cuda', 0) + batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device("cuda:0")) + expected = torch.device("cuda", 0) assert model.hook_called assert batch_gpu.samples.device == batch_gpu.targets.device == expected -@pytest.mark.parametrize( - 'max_epochs,batch_idx_', - [(2, 5), (3, 8), (4, 12)] -) +@pytest.mark.parametrize("max_epochs,batch_idx_", [(2, 5), (3, 8), (4, 12)]) def test_on_train_batch_start_hook(max_epochs, batch_idx_): class CurrentModel(EvalModelTemplate): def on_train_batch_start(self, batch, batch_idx, dataloader_idx): diff 
--git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 623f329035533..ca56a987aab98 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.accelerators.accelerator import CPUAccelerator from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState @@ -312,7 +312,7 @@ def _compute_batch(): accelerator='horovod', ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) # TODO: test that we selected the correct training_type_plugin based on horovod flags metric = Accuracy(compute_on_step=True, diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 45cd9b2154c43..8278ef60dc6bd 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -20,7 +20,7 @@ import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer, seed_everything -from pytorch_lightning.accelerators.accelerator import NewTPUAccelerator +from pytorch_lightning.accelerators.accelerator import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE @@ -250,7 +250,7 @@ def test_broadcast_on_tpu(): """ Checks if an object from the master process is broadcasted to other processes correctly""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) - assert isinstance(trainer.accelerator_backend, NewTPUAccelerator) + assert isinstance(trainer.accelerator_backend, TPUAccelerator) obj = ("ver_0.5", "logger_name", rank) result = trainer.accelerator_backend.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) From 348a1b04efd006a1694b3415ca28d166e0862f68 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:14:25 +0100 Subject: [PATCH 096/157] move old plugins --- pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/plugins/old/__init__.py | 0 pytorch_lightning/plugins/{ => old}/apex.py | 0 .../plugins/{ => old}/ddp_plugin.py | 0 .../{ => old}/ddp_sequential_plugin.py | 0 .../plugins/{ => old}/native_amp.py | 0 pytorch_lightning/plugins/{ => old}/plugin.py | 0 .../plugins/{ => old}/plugin_connector.py | 0 .../plugins/{ => old}/precision_plugin.py | 0 .../plugins/{ => old}/rpc_plugin.py | 0 .../{ => old}/sharded_native_amp_plugin.py | 0 .../plugins/{ => old}/sharded_plugin.py | 0 pytorch_lightning/trainer/optimizers.py | 21 ------------------- 13 files changed, 1 insertion(+), 21 deletions(-) create mode 100644 pytorch_lightning/plugins/old/__init__.py rename pytorch_lightning/plugins/{ => old}/apex.py (100%) rename pytorch_lightning/plugins/{ => old}/ddp_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/ddp_sequential_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/native_amp.py (100%) rename pytorch_lightning/plugins/{ => old}/plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/plugin_connector.py (100%) rename pytorch_lightning/plugins/{ => old}/precision_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/rpc_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/sharded_native_amp_plugin.py 
(100%) rename pytorch_lightning/plugins/{ => old}/sharded_plugin.py (100%) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index e69de29bb2d1d..b416a9f56aebe 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -0,0 +1 @@ +from pytorch_lightning.accelerators.plugins import * \ No newline at end of file diff --git a/pytorch_lightning/plugins/old/__init__.py b/pytorch_lightning/plugins/old/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/old/apex.py similarity index 100% rename from pytorch_lightning/plugins/apex.py rename to pytorch_lightning/plugins/old/apex.py diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/old/ddp_plugin.py similarity index 100% rename from pytorch_lightning/plugins/ddp_plugin.py rename to pytorch_lightning/plugins/old/ddp_plugin.py diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py similarity index 100% rename from pytorch_lightning/plugins/ddp_sequential_plugin.py rename to pytorch_lightning/plugins/old/ddp_sequential_plugin.py diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/old/native_amp.py similarity index 100% rename from pytorch_lightning/plugins/native_amp.py rename to pytorch_lightning/plugins/old/native_amp.py diff --git a/pytorch_lightning/plugins/plugin.py b/pytorch_lightning/plugins/old/plugin.py similarity index 100% rename from pytorch_lightning/plugins/plugin.py rename to pytorch_lightning/plugins/old/plugin.py diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/old/plugin_connector.py similarity index 100% rename from pytorch_lightning/plugins/plugin_connector.py rename to pytorch_lightning/plugins/old/plugin_connector.py diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/old/precision_plugin.py similarity index 100% rename from pytorch_lightning/plugins/precision_plugin.py rename to pytorch_lightning/plugins/old/precision_plugin.py diff --git a/pytorch_lightning/plugins/rpc_plugin.py b/pytorch_lightning/plugins/old/rpc_plugin.py similarity index 100% rename from pytorch_lightning/plugins/rpc_plugin.py rename to pytorch_lightning/plugins/old/rpc_plugin.py diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py similarity index 100% rename from pytorch_lightning/plugins/sharded_native_amp_plugin.py rename to pytorch_lightning/plugins/old/sharded_native_amp_plugin.py diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/old/sharded_plugin.py similarity index 100% rename from pytorch_lightning/plugins/sharded_plugin.py rename to pytorch_lightning/plugins/old/sharded_plugin.py diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index e56856dfb2b4f..33a7836ab974a 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -140,27 +140,6 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): raise ValueError(f'The provided lr scheduler "{scheduler}" is invalid') return lr_schedulers - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - # Reinitialize optimizer.step properties added by schedulers - for scheduler in schedulers: - scheduler = 
scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - - class _MockOptimizer(Optimizer): """The `_MockOptimizer` will be used inplace of an optimizer in the event that `None` is returned from `configure_optimizers`. From 14f2f6e9a8cd4438a305f6be1ae05320b370e8fd Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:14:47 +0100 Subject: [PATCH 097/157] move to plugins --- pytorch_lightning/accelerators/{ => plugins}/base_plugin.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pytorch_lightning/accelerators/{ => plugins}/base_plugin.py (100%) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/plugins/base_plugin.py similarity index 100% rename from pytorch_lightning/accelerators/base_plugin.py rename to pytorch_lightning/accelerators/plugins/base_plugin.py From 2f779c618f2cc8bad00f2df978f971eb9ff08f1b Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:15:18 +0100 Subject: [PATCH 098/157] create precision subpackage --- .../plugins/precision/__init__.py | 4 + .../plugins/precision/apex_amp.py | 115 +++++++++++ .../accelerators/plugins/precision/mixed.py | 7 + .../plugins/precision/native_amp.py | 48 +++++ .../plugins/precision/precision_plugin.py | 45 +++++ pytorch_lightning/accelerators/precision.py | 189 ------------------ 6 files changed, 219 insertions(+), 189 deletions(-) create mode 100644 pytorch_lightning/accelerators/plugins/precision/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/apex_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/mixed.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/native_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/precision_plugin.py delete mode 100644 pytorch_lightning/accelerators/precision.py diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py new file mode 100644 index 0000000000000..4f30fe58910f4 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -0,0 +1,4 @@ +from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py new file mode 100644 index 0000000000000..9bb749bf18dbb --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -0,0 +1,115 @@ +from contextlib import contextmanager +from typing import List, Tuple +import torch +from torch.optim import Optimizer +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import 
AMPType, APEX_AVAILABLE, rank_zero_warn +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin + +if APEX_AVAILABLE: + from apex import amp + +class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self, amp_level): + self.backend = AMPType.APEX + self.amp_level = amp_level + + def master_params(self, optimizer): + return amp.master_params(optimizer) + + def connect(self, model, optimizers, lr_schedulers): + model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) + self.reinit_scheduler_properties(optimizers, lr_schedulers) + return model, optimizers, lr_schedulers + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + closure_loss = amp.scale_loss(closure_loss, optimizer) + + # enter apex context + context = closure_loss + closure_loss = closure_loss.__enter__() + + # do backward pass + # TODO: not entirely sure, why we need this + if model is not None and isinstance(model, LightningModule): + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # exit amp context + a, b, c = None, None, None + error = context.__exit__(a, b, c) + if error: + rank_zero_warn(a, b, c) + raise Exception("apex unscale error") + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def configure_apex( + self, + amp: object, + model: LightningModule, + optimizers: List[Optimizer], + amp_level: str, + ) -> Tuple[LightningModule, List[Optimizer]]: + r""" + Override to init AMP your own way. + Must return a model and list of optimizers. + + Args: + amp: pointer to amp library object. + model: pointer to current :class:`LightningModule`. + optimizers: list of optimizers passed in :meth:`configure_optimizers`. + amp_level: AMP mode chosen ('O1', 'O2', etc...) + + Return: + Apex wrapped model and optimizers + + Examples: + .. code-block:: python + + # Default implementation used by Trainer. 
+ def configure_apex(self, amp, model, optimizers, amp_level): + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + """ + model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) + return model, optimizers + + @staticmethod + def reinit_scheduler_properties(optimizers: list, schedulers: list): + # Reinitialize optimizer.step properties added by schedulers + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + state = None + idx = 0 + + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/accelerators/plugins/precision/mixed.py new file mode 100644 index 0000000000000..1eb1ea18ebc23 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/mixed.py @@ -0,0 +1,7 @@ +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin + +class MixedPrecisionPlugin(PrecisionPlugin): + EPSILON = 1e-5 + backend: AMPType + precision = "mixed" \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/accelerators/plugins/precision/native_amp.py new file mode 100644 index 0000000000000..f233a43dfdd53 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/native_amp.py @@ -0,0 +1,48 @@ +from contextlib import contextmanager +import torch +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin + + +class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.NATIVE + self.scaler = torch.cuda.amp.GradScaler() + + def pre_optimizer_step(self, optimizer, optimizer_idx): + if isinstance(optimizer, torch.optim.LBFGS): + raise MisconfigurationException( + f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + def post_optimizer_step(self, optimizer, optimizer_idx): + self.scaler.update() + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + closure_loss = self.scaler.scale(closure_loss) + + automatic_optimization = model.automatic_optimization + + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) + + # unscale gradient to allow analyze within `on_after_backward` + if not should_accumulate and automatic_optimization: + self.scaler.unscale_(optimizer) + + return closure_loss + + @contextmanager + def train_step_context(self): + yield torch.cuda.amp.autocast() \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py new file mode 100644 index 0000000000000..048a645de250a --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -0,0 +1,45 @@ +import torch +from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin + + +class PrecisionPlugin(Plugin): + EPSILON = 1e-6 + precision = 32 + + def pre_optimizer_step(self, optimizer, optimizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def master_params(self, optimizer): + for group in optimizer.param_groups: + for p in group["params"]: + yield p + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + automatic_optimization = model.automatic_optimization + + # do backward pass + if automatic_optimization: + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + return closure_loss \ No newline at end of file diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py deleted file mode 100644 index a2ee98b686bae..0000000000000 --- a/pytorch_lightning/accelerators/precision.py +++ /dev/null @@ -1,189 +0,0 @@ -from contextlib import contextmanager -from pytorch_lightning.accelerators.base_plugin import Plugin -from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties -from pytorch_lightning.core.lightning import LightningModule -from typing import List, Tuple -import torch -from torch.optim import Optimizer - -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType, rank_zero_warn - -try: - from apex import amp -except ImportError: - amp = None - - -class PrecisionPlugin(Plugin): - EPSILON = 1e-6 - precision = 32 - - def pre_optimizer_step(self, optimizer, optimizer_idx): - pass - - def post_optimizer_step(self, optimizer, optimizer_idx): - pass - - def master_params(self, optimizer): - for group in optimizer.param_groups: - for p in group["params"]: - yield p - - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): - return model, optimizers, lr_schedulers - - def backward( - self, - 
model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = model.automatic_optimization - - # do backward pass - if automatic_optimization: - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - - return closure_loss - - -class MixedPrecisionPlugin(PrecisionPlugin): - EPSILON = 1e-5 - backend: AMPType - precision = "mixed" - - -class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self): - self.backend = AMPType.NATIVE - self.scaler = torch.cuda.amp.GradScaler() - - def pre_optimizer_step(self, optimizer, optimizer_idx): - if isinstance(optimizer, torch.optim.LBFGS): - raise MisconfigurationException( - f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." - " To request, please file a Github issue in PyTorch and tag @mcarilli" - ) - - def post_optimizer_step(self, optimizer, optimizer_idx): - self.scaler.update() - - def backward( - self, - model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - closure_loss = self.scaler.scale(closure_loss) - - # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = model.automatic_optimization - - closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) - - # unscale gradient to allow analyze within `on_after_backward` - if not should_accumulate and automatic_optimization: - self.scaler.unscale_(optimizer) - - return closure_loss - - @contextmanager - def train_step_context(self): - yield torch.cuda.amp.autocast() - - -class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self, amp_level): - self.backend = AMPType.APEX - self.amp_level = amp_level - - def master_params(self, optimizer): - return amp.master_params(optimizer) - - def connect(self, model, optimizers, lr_schedulers): - model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) - reinit_scheduler_properties(optimizers, lr_schedulers) - return model, optimizers, lr_schedulers - - def backward( - self, - model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - closure_loss = amp.scale_loss(closure_loss, optimizer) - - # enter apex context - context = closure_loss - closure_loss = closure_loss.__enter__() - - # do backward pass - # TODO: not entirely sure, why we need this - if model is not None and isinstance(model, LightningModule): - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # exit amp context - a, b, c = None, None, None - error = context.__exit__(a, b, c) - if error: - rank_zero_warn(a, b, c) - raise Exception("apex unscale error") - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss - - def configure_apex( - self, - amp: object, - model: LightningModule, - optimizers: 
List[Optimizer], - amp_level: str, - ) -> Tuple[LightningModule, List[Optimizer]]: - r""" - Override to init AMP your own way. - Must return a model and list of optimizers. - - Args: - amp: pointer to amp library object. - model: pointer to current :class:`LightningModule`. - optimizers: list of optimizers passed in :meth:`configure_optimizers`. - amp_level: AMP mode chosen ('O1', 'O2', etc...) - - Return: - Apex wrapped model and optimizers - - Examples: - .. code-block:: python - - # Default implementation used by Trainer. - def configure_apex(self, amp, model, optimizers, amp_level): - model, optimizers = amp.initialize( - model, optimizers, opt_level=amp_level, - ) - - return model, optimizers - """ - model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) - return model, optimizers \ No newline at end of file From 58536f673aaf1b352babccbfde3fc7cbb5eb9038 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:15:33 +0100 Subject: [PATCH 099/157] create training_type subpackage --- .../accelerators/data_parallel.py | 843 ------------------ .../accelerators/plugins/__init__.py | 3 + .../plugins/training_type/__init__.py | 8 + .../accelerators/plugins/training_type/ddp.py | 244 +++++ .../plugins/training_type/ddp2.py | 5 + .../plugins/training_type/ddp_spawn.py | 213 +++++ .../accelerators/plugins/training_type/dp.py | 44 + .../plugins/training_type/horovod.py | 148 +++ .../plugins/training_type/parallel.py | 91 ++ .../plugins/training_type/single_device.py | 40 + .../training_type/training_type_plugin.py | 93 ++ .../accelerators/scheduler_properties.py | 25 - 12 files changed, 889 insertions(+), 868 deletions(-) delete mode 100644 pytorch_lightning/accelerators/data_parallel.py create mode 100644 pytorch_lightning/accelerators/plugins/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp2.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/dp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/horovod.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/parallel.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/single_device.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py delete mode 100644 pytorch_lightning/accelerators/scheduler_properties.py diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py deleted file mode 100644 index 02a748222732e..0000000000000 --- a/pytorch_lightning/accelerators/data_parallel.py +++ /dev/null @@ -1,843 +0,0 @@ -from abc import ABC, abstractmethod -import re -from contextlib import contextmanager, ExitStack - -from torch.optim.lr_scheduler import _LRScheduler - -from pytorch_lightning.cluster_environments import TorchElasticEnvironment, ClusterEnvironment -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.utilities import HOROVOD_AVAILABLE -from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load -from pytorch_lightning.accelerators.base_plugin import Plugin - -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.seed import seed_everything -from 
pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.distributed.dist import LightningDistributed -import torch -import os -from pytorch_lightning.core.step_result import Result -from typing import Any, Dict, List, Optional, Union -from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel -import sys -from os.path import abspath -from time import sleep -import subprocess -from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only -import numpy as np -import torch.distributed as torch_distrib -from pytorch_lightning import _logger as log -import torch.multiprocessing as mp -from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info - -if HOROVOD_AVAILABLE: - import horovod.torch as hvd - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - - class ReduceOp: - SUM = None - - -class TrainingTypePlugin(Plugin, ABC): - def __init__(self): - self._model = None - self._results = None - self.global_rank = 0 - - @property - @abstractmethod - def on_gpu(self): - raise NotImplementedError - - @property - @abstractmethod - def root_device(self) -> torch.device: - raise NotImplementedError - - @abstractmethod - def model_to_device(self): - raise NotImplementedError - - @property - @abstractmethod - def is_global_zero(self): - raise NotImplementedError - - @abstractmethod - def reduce(self, output, *args, **kwargs): - raise NotImplementedError - - @abstractmethod - def barrier(self, name: Optional[str] = None): - raise NotImplementedError - - @abstractmethod - def broadcast(self, obj: object, src: int = 0) -> object: - raise NotImplementedError - - # TODO method this is currently unused - def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): - if device_ids is None: - return - - # set the correct cuda visible devices (using pci order) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) - devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) - log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return should_stop - - @property - def model(self): - return self._model - - @model.setter - def model(self, new_model): - self._model = new_model - - @property - def lightning_module(self): - return self._model - - @property - def results(self): - """ - The results of the last training/testing run will be cached here. - In distributed training, we make sure to transfer the results to the appropriate master process. 
- """ - # TODO: improve these docs - return self._results - - @property - def rpc_enabled(self): - return False - - def start_training(self, trainer): - # double dispatch to initiate the training loop - self._results = trainer.train() - - def start_testing(self, trainer): - # double dispatch to initiate the test loop - self._results = trainer.run_test() - - -class SingleDevicePlugin(TrainingTypePlugin): - def __init__(self, device): - super().__init__() - self.device: torch.device = device - - @property - def on_gpu(self): - return self.device.type == "cuda" and torch.cuda.is_available() - - def reduce(self, output, *args, **kwargs): - return output - - @property - def root_device(self): - return self.device - - def model_to_device(self): - if self.on_gpu: - torch.cuda.set_device(self.root_device) - - self._model.to(self.root_device) - - def connect(self, model: torch.nn.Module): - self._model = model - self.model_to_device() - return self.model - - @property - def is_global_zero(self): - return True - - def barrier(self, *args, **kwargs): - pass - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj - - -class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__( - self, - parallel_devices: List[torch.device], - cluster_environment: Optional[ClusterEnvironment] = None, - ): - super().__init__() - self.parallel_devices = parallel_devices - self.local_rank = 0 - self.world_size = 1 - self.cluster_environment = cluster_environment - - @property - @abstractmethod - def root_device(self): - raise NotImplementedError - - @property - def on_gpu(self): - return self.root_device.type == "cuda" and torch.cuda.is_available() - - @abstractmethod - def setup(self, model): - raise NotImplementedError - - def connect(self, model, *args, **kwargs): - self.setup(model) - return self.model - - @property - def is_global_zero(self) -> bool: - return self.global_rank == 0 - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=len(self.parallel_devices), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) - should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) - should_stop = bool(should_stop == self.world_size) - return should_stop - - @staticmethod - def configure_sync_batchnorm(model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) - return model - - @contextmanager - def block_backward_sync(self): - """ - Blocks ddp sync gradients behaviour on backwards pass. 
- This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, LightningDistributedDataParallel): - yield self.model.no_sync() - else: - yield None - - -class DataParallelPlugin(ParallelPlugin): - - def __init__(self, parallel_devices: List[torch.device]): - super().__init__(parallel_devices=parallel_devices, cluster_environment=None) - - def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) - - def reduce(self, output, *args, **kwargs): - if isinstance(output, Result): - output.dp_reduce() - - elif isinstance(output, torch.Tensor): - output = output.mean() - - return output - - @property - def root_device(self): - return self.parallel_devices[0] - - @property - def lightning_module(self): - return self._model.module - - def model_to_device(self): - # no need to do anything when model is wrapped in torch.nn.DataParallel - pass - - def barrier(self, *args, **kwargs): - pass - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return should_stop - - -class DDPPlugin(ParallelPlugin): - - distributed_backend = "ddp" - - def __init__( - self, - parallel_devices, - num_nodes=1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, - **kwargs: Dict[str, Any], - ) -> None: - super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) - self.interactive_ddp_procs = [] - self.num_nodes = num_nodes - self.sync_batchnorm = sync_batchnorm - self.dist = LightningDistributed() - self._ddp_kwargs = kwargs - self._has_spawned_children = False - self.task_idx = None - self.node_rank = 0 - self.num_processes = len(parallel_devices) - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=(self.num_nodes * self.num_processes), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - # start the other scripts - # TODO: make sure this works, in torchelastic we should not launch child processes! 
- if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": - self._call_children_scripts() - - # set the task idx - self.task_idx = self.cluster_environment.local_rank() - - def _call_children_scripts(self): - - # bookkeeping of spawned processes - assert self.global_rank == 0 - self._check_can_spawn_children() - self._has_spawned_children = True - - # DDP Environment variables - os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") - os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) - - # allow the user to pass the node rank - node_rank = "0" - node_rank = os.environ.get("NODE_RANK", node_rank) - node_rank = os.environ.get("GROUP_RANK", node_rank) - os.environ["NODE_RANK"] = node_rank - os.environ["LOCAL_RANK"] = "0" - - # when user is using hydra find the absolute path - path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception as e: - full_path = abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command - - # the visible devices tell us how many GPUs we want to use. - # when the trainer script was called the device has already been scoped by the time - # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone - # but forward the GPUs selected via environment variables - if self.parallel_devices is None: - raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - - os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) - os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - - if self.lightning_module.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - - num_gpus = len(self.parallel_devices) - os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" - - self.interactive_ddp_procs = [] - - for local_rank in range(1, self.num_processes): - env_copy = os.environ.copy() - env_copy["LOCAL_RANK"] = f"{local_rank}" - - # remove env var if global seed not set - if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: - del env_copy["PL_GLOBAL_SEED"] - - # start process - # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None - if HYDRA_AVAILABLE: - if HydraConfig.initialized(): - cwd = get_original_cwd() - proc = subprocess.Popen(command, env=env_copy, cwd=cwd) - self.interactive_ddp_procs.append(proc) - - # starting all processes at once can cause issues - # with dataloaders delay between 1-10 seconds - delay = np.random.uniform(1, 5, 1)[0] - sleep(delay) - - def _check_can_spawn_children(self): - if self._has_spawned_children: - raise RuntimeError( - "You tried to run `.fit` or `.test` multiple times in the same script." - " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." 
- ) - - def set_world_ranks(self): - self.local_rank = self.task_idx - self.node_rank = self.cluster_environment.node_rank() - self.global_rank = self.node_rank * self.num_processes + self.local_rank - self.world_size = self.num_nodes * self.num_processes - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def determine_ddp_device_ids(self): - if self.root_device.type == "cpu": - return None - return [self.root_device.index] - - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - # TODO: From where to get cluster environment? - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def pre_training(self): - # TODO: check if needed - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - # determine which process we are and world size - self.set_world_ranks() - - # set warning rank - rank_zero_only.rank = self.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - self.init_ddp_connection(self.global_rank, self.world_size) - - # TODO: we moved it to the trainer.fit after calling pre_training - # ... need to double check that it is the correct place - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - # set the ranks and devices - self.dist.rank = self.global_rank - self.dist.device = self.root_device - - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - self.configure_ddp() - - self.barrier() - - def post_training(self): - if "WORLD_SIZE" in os.environ: - del os.environ["WORLD_SIZE"] - - def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj: object, src: int = 0) -> object: - return self.dist.broadcast(obj) - - def model_to_device(self): - if self.root_device.type == "cuda": - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if isinstance(output, torch.Tensor): - output = sync_ddp_if_available(output, group, reduce_op) - return output - - -class DDPSpawnPlugin(ParallelPlugin): - - distributed_backend = "ddp_spawn" - - def __init__( - self, - parallel_devices, - num_nodes=1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, - **kwargs: Dict[str, Any] - ): - super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) - self.num_nodes = num_nodes - self.sync_batchnorm = sync_batchnorm - self._ddp_kwargs = kwargs - self.dist = LightningDistributed() - self.num_processes = len(parallel_devices) - self.node_rank = 0 - self.mp_queue = None - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=(self.num_nodes * self.num_processes), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() - - def set_world_ranks(self, process_idx): - self.local_rank = process_idx - self.node_rank = self.cluster_environment.node_rank() - self.global_rank = self.node_rank * self.num_processes + self.local_rank - self.world_size = self.num_nodes * self.num_processes - - def start_training(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) - - def start_testing(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) - - def new_process(self, process_idx, trainer): - # TODO: check if needed - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - self.init_ddp_connection(self.global_rank, self.world_size) - - # TODO: we moved it to the trainer.fit after calling pre_training - # ... 
need to double check that it is the correct place - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - # set the ranks and devices - self.dist.rank = self.global_rank - self.dist.device = self.root_device - - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - self.configure_ddp() - - self.barrier() - - if trainer.testing: - results = trainer.run_test() - else: - results = trainer.train() - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(results) - - def post_training(self): - # restore main state with best weights - best_path = self.mp_queue.get() - last_path = self.mp_queue.get() - self._results = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(best_path, last_path) - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - # TODO: this code is duplicated in DDP and DDPSpawn, make this a function - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def determine_ddp_device_ids(self): - if self.root_device.type == "cpu": - return None - return [self.root_device.index] - - def transfer_distrib_spawn_state_on_fit_end(self, results): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? - best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path - - if self.global_rank == 0 and self.mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - - # save the last weights - last_path = None - # TODO: is there a better way than accessing trainer through model -> trainer? - if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.lightning_module.state_dict(), last_path) - - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(last_path) - self.mp_queue.put(results) - - def __recover_child_process_weights(self, best_path, last_path): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
- # transfer back the best path to the trainer - if self.lightning_module.trainer.checkpoint_callback: - self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path - # todo, pass also best score - - # load last weights - if last_path is not None and not self.lightning_module.trainer.testing: - ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - self.lightning_module.load_state_dict(ckpt) - - def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj: object, src: int = 0) -> object: - return self.dist.broadcast(obj) - - def model_to_device(self): - if self.root_device.type == "cuda": - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if isinstance(output, torch.Tensor): - output = sync_ddp_if_available(output, group, reduce_op) - return output - - -# TODO: DDP2 -class DDP2Plugin(DDPPlugin): - pass - - -class HorovodPlugin(ParallelPlugin): - - def __init__(self, parallel_devices: List[torch.device]): - super().__init__(parallel_devices=parallel_devices, cluster_environment=None) - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=hvd.size(), - rank=hvd.rank() - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - self.global_rank = hvd.rank() - self.local_rank = hvd.local_rank() - rank_zero_only.rank = self.global_rank - - self.model_to_device() - - def pre_training(self): - - def _unpack_lightning_optimizer(opt): - return opt._optimizer if isinstance(opt, LightningOptimizer) else opt - - optimizers = self.lightning_module.trainer.optimizers - optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] - - # Horovod: scale the learning rate by the number of workers to account for - # increased total batch size - for optimizer in optimizers: - for param_group in optimizer.param_groups: - param_group['lr'] *= hvd.size() - - # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR - lr_schedulers = self.lightning_module.trainer.lr_schedulers - for scheduler in lr_schedulers: - scheduler = scheduler['scheduler'] - if isinstance(scheduler, _LRScheduler): - scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] - - # Horovod: broadcast parameters & optimizer state to ensure consistent initialization - hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) - for optimizer in optimizers: - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - def _filter_named_parameters(model, optimizer): - opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])]) - return [(name, p) for name, p in model.named_parameters() if p in opt_params] - - # Horovod: wrap optimizers to perform gradient aggregation via allreduce - optimizers = [ - hvd.DistributedOptimizer(optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer)) - for optimizer in optimizers - ] - - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers - - def start_training(self, trainer): - with ExitStack() as stack: - for optimizer in trainer.optimizers: - # Synchronization will be performed explicitly following backward() 
- stack.enter_context(optimizer.skip_synchronize()) - - # set up training routine - self._results = trainer.train() - - # Make sure all workers have finished training before returning to the user - hvd.join() - - def start_testing(self, trainer): - with ExitStack() as stack: - # set up training routine - # self.trainer.train_loop.setup_training(self.trainer.model) - self._results = trainer.run_test() - - # Make sure all workers have finished training before returning to the user - hvd.join() - - def barrier(self, *args, **kwargs): - hvd.join() - - def broadcast(self, obj: object, src: int = 0) -> object: - obj = hvd.broadcast_object(obj, src) - return obj - - def model_to_device(self): - if self.on_gpu: - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if group is not None: - raise ValueError( - "Horovod does not support allreduce using a subcommunicator at this time. " - "Unset `group`." - ) - - if reduce_op is None or reduce_op == "sum": - reduce_op = hvd.Sum - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = hvd.Average - else: - raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") - - # sync all processes before reduction - hvd.join() - return hvd.allreduce(output, op=reduce_op) - - def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): - if group is not None: - raise ValueError( - "Horovod does not support allgather using a subcommunicator at this time. " - "Unset `group`." - ) - - if len(result.shape) == 0: - # Convert scalars to single dimension tensors - result = result.reshape(1) - - # sync and gather all - hvd.join() - gathered = hvd.allgather(result) - gathered_result = list(gathered.split(1, dim=0)) - return gathered_result diff --git a/pytorch_lightning/accelerators/plugins/__init__.py b/pytorch_lightning/accelerators/plugins/__init__.py new file mode 100644 index 0000000000000..119284ef33c76 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/__init__.py @@ -0,0 +1,3 @@ +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.accelerators.plugins.precision import * +from pytorch_lightning.accelerators.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py new file mode 100644 index 0000000000000..532ea418a40bd --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -0,0 +1,8 @@ +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py new file mode 100644 index 0000000000000..ec275f227016a --- 
/dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -0,0 +1,244 @@
+import os
+import sys
+import subprocess
+from time import sleep
+import numpy as np
+from typing import Any, Dict, Optional, Union
+
+import torch
+import torch.distributed as torch_distrib
+
+from pytorch_lightning import _logger as log
+from pytorch_lightning.utilities import HYDRA_AVAILABLE
+from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment
+from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin
+from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
+from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.seed import seed_everything
+
+if HYDRA_AVAILABLE:
+    from hydra.utils import to_absolute_path, get_original_cwd
+    from hydra.core.hydra_config import HydraConfig
+
+if torch.distributed.is_available():
+    from torch.distributed import ReduceOp
+else:
+
+    class ReduceOp:
+        SUM = None
+
+
+class DDPPlugin(ParallelPlugin):
+
+    distributed_backend = "ddp"
+
+    def __init__(
+        self,
+        parallel_devices,
+        num_nodes=1,
+        cluster_environment: ClusterEnvironment = None,
+        sync_batchnorm=False,
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment)
+        self.interactive_ddp_procs = []
+        self.num_nodes = num_nodes
+        self.sync_batchnorm = sync_batchnorm
+        self.dist = LightningDistributed()
+        self._ddp_kwargs = kwargs
+        self._has_spawned_children = False
+        self.task_idx = None
+        self.node_rank = 0
+        self.num_processes = len(parallel_devices)
+
+    @property
+    def root_device(self):
+        return self.parallel_devices[self.local_rank]
+
+    @property
+    def lightning_module(self):
+        # the model may not be wrapped with DistributedDataParallel if calling this too early
+        return getattr(self._model, "module", self._model)
+
+    @property
+    def distributed_sampler_kwargs(self):
+        distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
+        return distributed_sampler_kwargs
+
+    def setup(self, model):
+        self._model = model
+
+        # start the other scripts
+        # TODO: make sure this works, in torchelastic we should not launch child processes!
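+        # the launching process (which does not have PL_IN_DDP_SUBPROCESS set) spawns the
+        # child scripts below; the children inherit PL_IN_DDP_SUBPROCESS=1 and skip this branch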
+ if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = self.cluster_environment.local_rank() + + def _call_children_scripts(self): + + # bookkeeping of spawned processes + assert self.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + # DDP Environment variables + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = os.path.abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = os.path.abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.parallel_devices is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + if self.lightning_module.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) + + num_gpus = len(self.parallel_devices) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def _check_can_spawn_children(self): + if self._has_spawned_children: + raise RuntimeError( + "You tried to run `.fit` or `.test` multiple times in the same script." + " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." 
+ ) + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank + self.world_size = self.num_nodes * self.num_processes + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self._model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None + return [self.root_device.index] + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: From where to get cluster environment? + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def pre_training(self): + # TODO: check if needed + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # determine which process we are and world size + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + def post_training(self): + if "WORLD_SIZE" in os.environ: + del os.environ["WORLD_SIZE"] + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py new file mode 100644 index 0000000000000..078dfe6cd6ec1 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py @@ -0,0 +1,5 @@ +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin + +# TODO: DDP2 +class DDP2Plugin(DDPPlugin): + pass \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py new file mode 100644 index 0000000000000..e2c61bfe6e3fd --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -0,0 +1,213 @@ +import re +import os +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from typing import Any, Dict, Optional, Union +import torch + +import torch.multiprocessing as mp +import torch.distributed as torch_distrib + +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn +from pytorch_lightning.utilities.seed import seed_everything + +from pytorch_lightning import _logger as log + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class DDPSpawnPlugin(ParallelPlugin): + + distributed_backend = "ddp_spawn" + + def __init__( + self, + parallel_devices, + num_nodes=1, + cluster_environment: ClusterEnvironment = None, + sync_batchnorm=False, + **kwargs: Dict[str, Any], + ): + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) + self.num_nodes = num_nodes + self.sync_batchnorm = sync_batchnorm + self._ddp_kwargs = kwargs + self.dist = LightningDistributed() + self.num_processes = len(parallel_devices) + self.node_rank = 0 + self.mp_queue = None + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if 
calling this too early + return getattr(self._model, "module", self._model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context("spawn") + self.mp_queue = smp.SimpleQueue() + + def set_world_ranks(self, process_idx): + self.local_rank = process_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank + self.world_size = self.num_nodes * self.num_processes + + def start_training(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + + def start_testing(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + + def new_process(self, process_idx, trainer): + # TODO: check if needed + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(results) + + def post_training(self): + # restore main state with best weights + best_path = self.mp_queue.get() + last_path = self.mp_queue.get() + self._results = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: this code is duplicated in DDP and DDPSpawn, make this a function + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None + return [self.root_device.index] + + def transfer_distrib_spawn_state_on_fit_end(self, results): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? + best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn("cleaning up ddp environment...") + + # save the last weights + last_path = None + # TODO: is there a better way than accessing trainer through model -> trainer? + if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path) + atomic_save(self.lightning_module.state_dict(), last_path) + + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(last_path) + self.mp_queue.put(results) + + def __recover_child_process_weights(self, best_path, last_path): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
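+        # the spawned workers live in separate processes, so the main process only sees the
+        # checkpoint paths and results handed back through the SimpleQueue in post_training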
+ # transfer back the best path to the trainer + if self.lightning_module.trainer.checkpoint_callback: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + if last_path is not None and not self.lightning_module.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + self.lightning_module.load_state_dict(ckpt) + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/accelerators/plugins/training_type/dp.py new file mode 100644 index 0000000000000..0c50d077633af --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/dp.py @@ -0,0 +1,44 @@ +from typing import List + +import torch +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.overrides.data_parallel import LightningDataParallel + +class DataParallelPlugin(ParallelPlugin): + + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + def setup(self, model): + self._model = LightningDataParallel(model, self.parallel_devices) + + def reduce(self, output, *args, **kwargs): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_devices[0] + + @property + def lightning_module(self): + return self._model.module + + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + + def barrier(self, *args, **kwargs): + pass + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py new file mode 100644 index 0000000000000..72e14c1a6a790 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -0,0 +1,148 @@ +from contextlib import ExitStack +from pytorch_lightning.utilities.distributed import rank_zero_only +from typing import Any, List, Optional, Union + +import torch +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities import HOROVOD_AVAILABLE +from pytorch_lightning.core.optimizer import LightningOptimizer +from torch.optim.lr_scheduler import _LRScheduler + +if HOROVOD_AVAILABLE: + import horovod.torch as hvd + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class HorovodPlugin(ParallelPlugin): + def __init__(self, parallel_devices: List[torch.device]): + 
super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=hvd.size(), rank=hvd.rank()) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + self.global_rank = hvd.rank() + self.local_rank = hvd.local_rank() + rank_zero_only.rank = self.global_rank + + self.model_to_device() + + def pre_training(self): + def _unpack_lightning_optimizer(opt): + return opt._optimizer if isinstance(opt, LightningOptimizer) else opt + + optimizers = self.lightning_module.trainer.optimizers + optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] + + # Horovod: scale the learning rate by the number of workers to account for + # increased total batch size + for optimizer in optimizers: + for param_group in optimizer.param_groups: + param_group["lr"] *= hvd.size() + + # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR + lr_schedulers = self.lightning_module.trainer.lr_schedulers + for scheduler in lr_schedulers: + scheduler = scheduler["scheduler"] + if isinstance(scheduler, _LRScheduler): + scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] + + # Horovod: broadcast parameters & optimizer state to ensure consistent initialization + hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) + for optimizer in optimizers: + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + def _filter_named_parameters(model, optimizer): + opt_params = set([p for group in optimizer.param_groups for p in group.get("params", [])]) + return [(name, p) for name, p in model.named_parameters() if p in opt_params] + + # Horovod: wrap optimizers to perform gradient aggregation via allreduce + optimizers = [ + hvd.DistributedOptimizer( + optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) + ) + for optimizer in optimizers + ] + + optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) + self.lightning_module.trainer.optimizers = optimizers + + def start_training(self, trainer): + with ExitStack() as stack: + for optimizer in trainer.optimizers: + # Synchronization will be performed explicitly following backward() + stack.enter_context(optimizer.skip_synchronize()) + + # set up training routine + self._results = trainer.train() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def start_testing(self, trainer): + with ExitStack() as stack: + # set up training routine + # self.trainer.train_loop.setup_training(self.trainer.model) + self._results = trainer.run_test() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def barrier(self, *args, **kwargs): + hvd.join() + + def broadcast(self, obj: object, src: int = 0) -> object: + obj = hvd.broadcast_object(obj, src) + return obj + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " "Unset `group`." 
+ ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(output, op=reduce_op) + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " "Unset `group`." + ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/accelerators/plugins/training_type/parallel.py new file mode 100644 index 0000000000000..fd366f677b55f --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/parallel.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import List, Optional +import torch +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core import LightningModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + +class ParallelPlugin(TrainingTypePlugin, ABC): + def __init__( + self, + parallel_devices: List[torch.device], + cluster_environment: Optional[ClusterEnvironment] = None, + ): + super().__init__() + self.parallel_devices = parallel_devices + self.local_rank = 0 + self.world_size = 1 + self.cluster_environment = cluster_environment + + @property + @abstractmethod + def root_device(self): + raise NotImplementedError + + @property + def on_gpu(self): + return self.root_device.type == "cuda" and torch.cuda.is_available() + + @abstractmethod + def setup(self, model): + raise NotImplementedError + + def connect(self, model, *args, **kwargs): + self.setup(model) + return self.model + + @property + def is_global_zero(self) -> bool: + return self.global_rank == 0 + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=len(self.parallel_devices), + rank=self.global_rank + ) + return distributed_sampler_kwargs + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) + should_stop = bool(should_stop == self.world_size) + return should_stop + + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. 
+ + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + + @contextmanager + def block_backward_sync(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(self.model, LightningDistributedDataParallel): + yield self.model.no_sync() + else: + yield None \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/accelerators/plugins/training_type/single_device.py new file mode 100644 index 0000000000000..2e674ef87fbb4 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/single_device.py @@ -0,0 +1,40 @@ +import torch +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin + + +class SingleDevicePlugin(TrainingTypePlugin): + def __init__(self, device): + super().__init__() + self.device: torch.device = device + + @property + def on_gpu(self): + return self.device.type == "cuda" and torch.cuda.is_available() + + def reduce(self, output, *args, **kwargs): + return output + + @property + def root_device(self): + return self.device + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + + self._model.to(self.root_device) + + def connect(self, model: torch.nn.Module): + self._model = model + self.model_to_device() + return self.model + + @property + def is_global_zero(self): + return True + + def barrier(self, *args, **kwargs): + pass + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py new file mode 100644 index 0000000000000..94d4dbf9d3409 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py @@ -0,0 +1,93 @@ +import os + +from abc import ABC, abstractmethod +from typing import Optional +import torch + +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin + +from pytorch_lightning import _logger as log + +class TrainingTypePlugin(Plugin, ABC): + def __init__(self): + self._model = None + self._results = None + self.global_rank = 0 + + @property + @abstractmethod + def on_gpu(self): + raise NotImplementedError + + @property + @abstractmethod + def root_device(self) -> torch.device: + raise NotImplementedError + + @abstractmethod + def model_to_device(self): + raise NotImplementedError + + @property + @abstractmethod + def is_global_zero(self): + raise NotImplementedError + + @abstractmethod + def reduce(self, output, *args, **kwargs): + raise NotImplementedError + + @abstractmethod + def barrier(self, name: Optional[str] = None): + raise NotImplementedError + + @abstractmethod + def broadcast(self, obj: object, src: int = 0) -> object: + raise NotImplementedError + + # TODO method this is currently unused + def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): + if device_ids is None: + return + + # set the correct cuda visible devices (using pci order) + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) + 
log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + + @property + def model(self): + return self._model + + @model.setter + def model(self, new_model): + self._model = new_model + + @property + def lightning_module(self): + return self._model + + @property + def results(self): + """ + The results of the last training/testing run will be cached here. + In distributed training, we make sure to transfer the results to the appropriate master process. + """ + # TODO: improve these docs + return self._results + + @property + def rpc_enabled(self): + return False + + def start_training(self, trainer): + # double dispatch to initiate the training loop + self._results = trainer.train() + + def start_testing(self, trainer): + # double dispatch to initiate the test loop + self._results = trainer.run_test() diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py deleted file mode 100644 index 37dbdd13c3c58..0000000000000 --- a/pytorch_lightning/accelerators/scheduler_properties.py +++ /dev/null @@ -1,25 +0,0 @@ -from torch import optim - - -def reinit_scheduler_properties(optimizers: list, schedulers: list): - # Reinitialize optimizer.step properties added by schedulers - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - state = None - idx = 0 - - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) \ No newline at end of file From ee53c90fd06fef04cfec7f22feb73cd9e720d5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 7 Jan 2021 21:21:02 +0100 Subject: [PATCH 100/157] fix all new import errors --- pytorch_lightning/accelerators/__init__.py | 4 ++++ pytorch_lightning/accelerators/accelerator.py | 8 ++------ .../accelerators/accelerator_connector.py | 16 +++++++-------- pytorch_lightning/accelerators/cpu.py | 2 +- .../plugins/precision/apex_amp.py | 4 ++-- .../accelerators/plugins/training_type/ddp.py | 20 +++++++++++++------ .../plugins/training_type/horovod.py | 4 ++-- .../cluster_environment.py | 2 +- pytorch_lightning/plugins/old/apex.py | 2 +- pytorch_lightning/plugins/old/ddp_plugin.py | 2 +- .../plugins/old/ddp_sequential_plugin.py | 2 +- pytorch_lightning/plugins/old/native_amp.py | 2 +- .../plugins/old/plugin_connector.py | 10 +++++----- .../plugins/old/precision_plugin.py | 2 +- pytorch_lightning/plugins/old/rpc_plugin.py | 2 +- .../plugins/old/sharded_native_amp_plugin.py | 2 +- .../plugins/old/sharded_plugin.py | 4 ++-- .../trainer/connectors/precision_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 5 +---- pytorch_lightning/trainer/training_loop.py | 2 +- tests/backends/test_accelerator_connector.py | 4 ++-- tests/models/test_gpu.py | 4 +--- tests/models/test_horovod.py | 3 +-- tests/models/test_tpu.py | 2 +- tests/plugins/test_plugin_properties.py | 2 +- 25 files changed, 58 insertions(+), 56 deletions(-) diff --git a/pytorch_lightning/accelerators/__init__.py 
b/pytorch_lightning/accelerators/__init__.py index e69de29bb2d1d..2ec118303d153 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -0,0 +1,4 @@ +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.tpu import TPUAccelerator diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 81eb112206d28..f9b18304316ef 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,8 +1,4 @@ -import os - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.data_parallel import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.utilities import AMPType from typing import Any import math @@ -11,7 +7,7 @@ from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.accelerators.precision import ( +from pytorch_lightning.accelerators.plugins.precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e03e51cbba6ed..e3467e4be3617 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,11 +18,11 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin -from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser +from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -40,9 +40,9 @@ try: import horovod.torch as hvd except (ModuleNotFoundError, ImportError): - HOROVOD_AVAILABLE = False + _HOROVOD_AVAILABLE = False else: - HOROVOD_AVAILABLE = True + _HOROVOD_AVAILABLE = True class BackendConnector(object): @@ -180,7 +180,7 @@ def select_precision_plugin(self): elif self.precision == 16: if self.amp_type == 'native': - if not NATIVE_AMP_AVAILABLE: + if not _NATIVE_AMP_AVAILABLE: rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' ' Consider upgrading with `pip install torch>=1.6`.' 
' We will attempt to use NVIDIA Apex for this session.') @@ -191,7 +191,7 @@ def select_precision_plugin(self): return NativeMixedPrecisionPlugin() if self.amp_type =='apex': - if not APEX_AVAILABLE: + if not _APEX_AVAILABLE: rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: @@ -371,7 +371,7 @@ def _set_horovod_backend(self): def check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" - if not HOROVOD_AVAILABLE: + if not _HOROVOD_AVAILABLE: raise MisconfigurationException( 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index e9f49e20a464f..820fab6d7d0f8 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py index 9bb749bf18dbb..08b4fe7906732 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -3,10 +3,10 @@ import torch from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin -if APEX_AVAILABLE: +if _APEX_AVAILABLE: from apex import amp class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index ec275f227016a..4e865a959ae73 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -3,13 +3,14 @@ import subprocess from time import sleep import numpy as np -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union import torch import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import HYDRA_AVAILABLE +from pytorch_lightning.distributed import LightningDistributed +from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel @@ -17,10 +18,17 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything -if HYDRA_AVAILABLE: +if _HYDRA_AVAILABLE: from hydra.utils import to_absolute_path, get_original_cwd from hydra.core.hydra_config import HydraConfig +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class 
ReduceOp: + SUM = None + class DDPPlugin(ParallelPlugin): @@ -38,7 +46,7 @@ def __init__( self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.sync_batchnorm = sync_batchnorm - self.dist = LightningDistributedDataParallel() + self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None @@ -89,7 +97,7 @@ def _call_children_scripts(self): os.environ["LOCAL_RANK"] = "0" # when user is using hydra find the absolute path - path_lib = os.path.abspath if not HYDRA_AVAILABLE else to_absolute_path + path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path # pull out the commands used to run the script and resolve the abs file path command = sys.argv @@ -131,7 +139,7 @@ def _call_children_scripts(self): # start process # if hydra is available and initialized, make sure to set the cwd correctly cwd: Optional[str] = None - if HYDRA_AVAILABLE: + if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() proc = subprocess.Popen(command, env=env_copy, cwd=cwd) diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py index 72e14c1a6a790..fee77f762fde1 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -4,11 +4,11 @@ import torch from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.utilities import HOROVOD_AVAILABLE +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.core.optimizer import LightningOptimizer from torch.optim.lr_scheduler import _LRScheduler -if HOROVOD_AVAILABLE: +if _HOROVOD_AVAILABLE: import horovod.torch as hvd if torch.distributed.is_available(): diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 6de290cd63ee9..8652d701dbf83 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
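The `_HOROVOD_AVAILABLE`, `_HYDRA_AVAILABLE`, `_NATIVE_AMP_AVAILABLE` and `_APEX_AVAILABLE` flags renamed in the hunks above all follow the same guarded-import shape: the optional package is imported once at module import time, the flag records whether that import succeeded, and call sites branch on the flag instead of importing again. A minimal, self-contained sketch of the pattern, using Horovod as in the plugin above; the module-level `check_horovod` function and the plain `RuntimeError` are illustrative stand-ins for the connector's `check_horovod` method and its `MisconfigurationException`:

    # Guarded optional import: the flag is assigned exactly once, at import time.
    try:
        import horovod.torch as hvd  # noqa: F401
    except (ModuleNotFoundError, ImportError):
        _HOROVOD_AVAILABLE = False
    else:
        _HOROVOD_AVAILABLE = True


    def check_horovod() -> None:
        # Call sites test the flag rather than re-importing the package, so the
        # module stays importable on machines without the optional dependency.
        if not _HOROVOD_AVAILABLE:
            raise RuntimeError(
                "Requested the Horovod backend, but Horovod is not installed. "
                "Install with: HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]"
            )
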
-from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin class ClusterEnvironment(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/apex.py b/pytorch_lightning/plugins/old/apex.py index f80461e5d4fe5..d917924eb0960 100644 --- a/pytorch_lightning/plugins/old/apex.py +++ b/pytorch_lightning/plugins/old/apex.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import rank_zero_warn diff --git a/pytorch_lightning/plugins/old/ddp_plugin.py b/pytorch_lightning/plugins/old/ddp_plugin.py index f0da9e5ff1a2d..360479de5a665 100644 --- a/pytorch_lightning/plugins/old/ddp_plugin.py +++ b/pytorch_lightning/plugins/old/ddp_plugin.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedModule, prepare_for_backward -from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin from pytorch_lightning.utilities import DeviceType diff --git a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py index 82250d1ed9fdd..dc39d648d2f13 100644 --- a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py @@ -21,7 +21,7 @@ from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.old.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/native_amp.py b/pytorch_lightning/plugins/old/native_amp.py index 4df5d128476a4..832d6acc672b4 100644 --- a/pytorch_lightning/plugins/old/native_amp.py +++ b/pytorch_lightning/plugins/old/native_amp.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin class NativeAMPPlugin(PrecisionPlugin): diff --git a/pytorch_lightning/plugins/old/plugin_connector.py b/pytorch_lightning/plugins/old/plugin_connector.py index e1071fa24ec04..77dae1229743e 100644 --- a/pytorch_lightning/plugins/old/plugin_connector.py +++ b/pytorch_lightning/plugins/old/plugin_connector.py @@ -15,11 +15,11 @@ from typing import List, Optional, Union from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.apex import ApexPlugin -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.plugins.plugin import LightningPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.old.apex import ApexPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from 
pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.old.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import AMPType, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/precision_plugin.py b/pytorch_lightning/plugins/old/precision_plugin.py index aaac3ede3c623..69d8e3670678d 100644 --- a/pytorch_lightning/plugins/old/precision_plugin.py +++ b/pytorch_lightning/plugins/old/precision_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin class PrecisionPlugin(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/rpc_plugin.py b/pytorch_lightning/plugins/old/rpc_plugin.py index fd3825a343463..4445b1d35970e 100644 --- a/pytorch_lightning/plugins/old/rpc_plugin.py +++ b/pytorch_lightning/plugins/old/rpc_plugin.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE DEFAULT_RPC_TIMEOUT_SEC = 60. diff --git a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py index 5ddd29521203d..c29821dcd8a8d 100644 --- a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/old/sharded_plugin.py b/pytorch_lightning/plugins/old/sharded_plugin.py index ec1500ca7abf4..19e0859587585 100644 --- a/pytorch_lightning/plugins/old/sharded_plugin.py +++ b/pytorch_lightning/plugins/old/sharded_plugin.py @@ -15,8 +15,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.sharded_native_amp_plugin import ShardedNativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, AMPType, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index 78f1635fb7f4d..af8db214eff9d 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -13,8 +13,8 @@ # limitations under the License. 
from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.apex import ApexPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.old.apex import ApexPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4d0718c5e2b48..5bf2fdcea7991 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -16,7 +16,6 @@ import os from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -25,7 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.plugins.old.plugin_connector import PluginConnector from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector @@ -34,7 +33,6 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import ConfigValidator @@ -78,7 +76,6 @@ from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.trainer.properties import TrainerProperties -from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7c010ba72c137..b3510f0f400fe 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,7 @@ import numpy as np import torch -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index b6f27f32a85fc..92950274e49cd 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -22,8 +22,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin -from pytorch_lightning.accelerators.precision import PrecisionPlugin +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from 
pytorch_lightning.accelerators.plugins import PrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 5643dce5a6160..bcc3709d129cf 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,11 +21,9 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate - +from tests.base import BoringModel PRETEND_N_OF_GPUS = 16 diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index ca56a987aab98..62782921ef85c 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,8 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.accelerator import CPUAccelerator -from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult +from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 8278ef60dc6bd..20e9473b3a910 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -20,7 +20,7 @@ import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer, seed_everything -from pytorch_lightning.accelerators.accelerator import TPUAccelerator +from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE diff --git a/tests/plugins/test_plugin_properties.py b/tests/plugins/test_plugin_properties.py index 5466bd07cd03a..ef87a79d4bb5c 100644 --- a/tests/plugins/test_plugin_properties.py +++ b/tests/plugins/test_plugin_properties.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
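The test imports above work because of the re-exports added to pytorch_lightning/accelerators/__init__.py at the start of this patch: the package now publishes Accelerator, CPUAccelerator, GPUAccelerator and TPUAccelerator at its top level, so the concrete module path and the package path resolve to the same class. A small sketch of that equivalence, assuming only that the patched package is importable; the alias names are illustrative:

    from pytorch_lightning.accelerators.cpu import CPUAccelerator as FromModule
    from pytorch_lightning.accelerators import CPUAccelerator as FromPackage

    # The package __init__ simply re-imports the class, so both names are bound
    # to the same object; tests can use whichever spelling reads better
    # (test_horovod.py uses the module path, test_tpu.py the package path).
    assert FromModule is FromPackage
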
from pytorch_lightning import Trainer -from pytorch_lightning.plugins.plugin_connector import LightningCustomPlugins, PluginConnector +from pytorch_lightning.plugins.old.plugin_connector import LightningCustomPlugins, PluginConnector def test_available_plugins_trainer(): From 894e604f7b3fcc8284035c6efefc5ec722346dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 7 Jan 2021 21:27:36 +0100 Subject: [PATCH 101/157] fix wrong arguments order passed to test --- tests/trainer/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 42d9072e476d6..b3105e97e18c1 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -129,7 +129,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(dataloader, model) + tpipes.run_prediction(trained_model=model, dataloader=dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) From 2bdc836b24b095cec757dd36bd73491b0d6fdd7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Jan 2021 04:52:51 +0100 Subject: [PATCH 102/157] fix LR finder --- pytorch_lightning/trainer/properties.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 494e91a298843..2e7e122730472 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,7 +15,7 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import cast, List, Optional, Type, TypeVar, Union +from typing import cast, List, Optional, Type, TypeVar, Union, Any from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector @@ -358,7 +358,7 @@ def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) @property - def model(self): + def model(self) -> Any: """ The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. To access the pure LightningModule, use @@ -366,6 +366,18 @@ def model(self): """ return self.accelerator.model + @model.setter + def model(self, model: Any): + """ + Setter for the model, pass-through to accelerator and plugin where the model reference is stored. + Used by the Tuner to reset the state of Trainer and Accelerator. + + Args: + model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending + on the backend. 
+ """ + self.accelerator.model = model + def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible From 48b9882e52768e079c15f399556e4f58a6675029 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 11 Jan 2021 14:04:05 +0000 Subject: [PATCH 103/157] Added sharded training type and amp plugin --- .../plugins/precision/__init__.py | 1 + .../plugins/precision/sharded_native_amp.py | 37 +++++++++++++++++++ .../plugins/training_type/__init__.py | 1 + .../plugins/training_type/sharded.py | 36 ++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/sharded.py diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py index 4f30fe58910f4..e4c6f2076e14b 100644 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -1,4 +1,5 @@ from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py new file mode 100644 index 0000000000000..fb332f0572fd6 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -0,0 +1,37 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Union, cast + +from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE +from torch.optim import Optimizer + +from pytorch_lightning.accelerators.plugins import NativeMixedPrecisionPlugin + +if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + from fairscale.optim.grad_scaler import ShardedGradScaler + + +class ShardedNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): + + def __init__(self): + super().__init__() + self.scaler = ShardedGradScaler() + + def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + # todo: accelerator needs to rely on precision plugin to clip gradients. 
+ max_norm = grad_clip_val + norm_type = float(2.0) + optimizer = cast(OSS, optimizer) + optimizer.clip_grad_norm(max_norm, norm_type=norm_type) diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 532ea418a40bd..d9955969480f7 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,6 +2,7 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py new file mode 100644 index 0000000000000..83aa2f317b07b --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -0,0 +1,36 @@ +from pytorch_lightning.accelerators.plugins import DDPPlugin +from pytorch_lightning.core.optimizer import is_lightning_optimizer +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE + +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + + from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + + +class ShardedPlugin(DDPPlugin): + def configure_ddp(self): + self._model = LightningShardedDataParallel( + self.model, + sharded_optimizer=self.lightning_module.trainer.optimizers + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + super().init_ddp_connection(global_rank, world_size) + self._reinit_optimizers_with_oss() + + def _reinit_optimizers_with_oss(self): + optimizers = self.lightning_module.trainer.optimizers + for x, optimizer in enumerate(optimizers): + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS( + params=optimizer.param_groups, + optim=optim_class, + **optimizer.defaults + ) + optimizers[x] = zero_optimizer + del optimizer + self.lightning_module.trainer.convert_to_lightning_optimizers() From 38452b643ad9bf0444503b3d43a46ff9bfbf2c7e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 11 Jan 2021 17:08:35 +0000 Subject: [PATCH 104/157] Move clip grad to precision plugin --- pytorch_lightning/accelerators/accelerator.py | 38 +---------------- .../plugins/precision/precision_plugin.py | 42 ++++++++++++++++++- .../plugins/precision/sharded_native_amp.py | 7 +--- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index f9b18304316ef..3a6c0e8f6bfbe 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -145,43 +145,7 @@ def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val): - # TODO: separate TPU case from 
here - self._clip_gradients(optimizer, clip_val) - - def _clip_gradients(self, optimizer, grad_clip_val): - if grad_clip_val is None: - return - - grad_clip_val = float(grad_clip_val) - - if grad_clip_val <= 0: - return - - parameters = self.precision_plugin.master_params(optimizer) - - max_norm = grad_clip_val - norm_type = float(2.0) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - device = parameters[0].device - - if norm_type == math.inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - out = torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - eps = self.precision_plugin.EPSILON - - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + self.precision_plugin.clip_gradients(optimizer, clip_val) def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py index 048a645de250a..6098edfde60b4 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -1,4 +1,9 @@ +import math +from typing import Union + import torch +from torch.optim import Optimizer + from pytorch_lightning.core import LightningModule from pytorch_lightning.accelerators.plugins.base_plugin import Plugin @@ -42,4 +47,39 @@ def backward( # once backward has been applied, release graph closure_loss = closure_loss.detach() - return closure_loss \ No newline at end of file + return closure_loss + + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)): + # TODO: separate TPU case from here + if clip_val is None: + return + + grad_clip_val = float(clip_val) + + if grad_clip_val <= 0: + return + + parameters = self.master_params(optimizer) + + max_norm = grad_clip_val + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = self.EPSILON + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index fb332f0572fd6..4d27cb2cebc04 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -29,9 +29,6 @@ def __init__(self): super().__init__() self.scaler = ShardedGradScaler() - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): - # todo: accelerator needs to rely on 
precision plugin to clip gradients. - max_norm = grad_clip_val - norm_type = float(2.0) + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)): optimizer = cast(OSS, optimizer) - optimizer.clip_grad_norm(max_norm, norm_type=norm_type) + optimizer.clip_grad_norm(clip_val, norm_type=norm_type) From 173b22c49c9efff79b090bbe21fcae3773137e44 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 15:40:55 +0000 Subject: [PATCH 105/157] Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically --- .../accelerators/accelerator_connector.py | 19 +++++++--- .../plugins/training_type/__init__.py | 3 +- .../plugins/training_type/sharded.py | 2 +- .../plugins/training_type/sharded_spawn.py | 36 +++++++++++++++++++ 4 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e3467e4be3617..65529ddc89825 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,8 +19,9 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin -from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin, ShardedDDPPlugin, ShardedSpawnDDPPlugin +from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ + PrecisionPlugin, ShardedNativeMixedPrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -187,13 +188,15 @@ def select_precision_plugin(self): self.amp_type = 'apex' else: log.info('Using native 16bit precision.') + if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + return ShardedNativeMixedPrecisionPlugin() self.amp_type = AMPType.NATIVE return NativeMixedPrecisionPlugin() - if self.amp_type =='apex': + if self.amp_type == 'apex': if not _APEX_AVAILABLE: rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' 
- ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX @@ -215,13 +218,19 @@ def select_training_type_plugin(self): use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks + use_ddp_sharded = self.distributed_backend == "ddp_sharded" + use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" # ddp script mode uses the same flags as TE # TODO: decouple from TE if os.environ.get('PL_IN_DDP_SUBPROCESS', False): use_torchelastic_ddp = False - if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + if use_ddp_sharded: + ddp_plugin_cls = ShardedDDPPlugin + elif use_ddp_sharded_spawn: + ddp_plugin_cls = ShardedSpawnDDPPlugin + elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: ddp_plugin_cls = DDPSpawnPlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index d9955969480f7..1da1a00e0c1a1 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,7 +2,8 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedDDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import ShardedSpawnDDPPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index 83aa2f317b07b..5aebd58937165 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class ShardedPlugin(DDPPlugin): +class ShardedDDPPlugin(DDPPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py new file mode 100644 index 0000000000000..3f6862cb9ff7f --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -0,0 +1,36 @@ +from pytorch_lightning.accelerators.plugins import DDPSpawnPlugin +from pytorch_lightning.core.optimizer import is_lightning_optimizer +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE + +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + + from 
pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + + +class ShardedSpawnDDPPlugin(DDPSpawnPlugin): + def configure_ddp(self): + self._model = LightningShardedDataParallel( + self.model, + sharded_optimizer=self.lightning_module.trainer.optimizers + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + super().init_ddp_connection(global_rank, world_size) + self._reinit_optimizers_with_oss() + + def _reinit_optimizers_with_oss(self): + optimizers = self.lightning_module.trainer.optimizers + for x, optimizer in enumerate(optimizers): + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS( + params=optimizer.param_groups, + optim=optim_class, + **optimizer.defaults + ) + optimizers[x] = zero_optimizer + del optimizer + self.lightning_module.trainer.convert_to_lightning_optimizers() From 79803f69c61cfaeea71741e1c337792917bdd8a6 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 16:57:09 +0000 Subject: [PATCH 106/157] Fix import issue, attempting to fix tests --- benchmarks/test_sharded_parity.py | 51 +++++-------------- .../accelerators/accelerator_connector.py | 10 ++-- .../plugins/precision/__init__.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- .../plugins/training_type/__init__.py | 4 +- .../plugins/training_type/sharded.py | 4 +- .../plugins/training_type/sharded_spawn.py | 4 +- tests/plugins/test_sharded_plugin.py | 21 ++++---- 8 files changed, 39 insertions(+), 59 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 05fde8e11523a..67b2c2e7c70a1 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -15,14 +15,12 @@ import os import platform import time -from typing import Type, Union +from typing import Type import pytest import torch from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from tests.backends import DDPLauncher from tests.base.boring_model import BoringModel, RandomDataset @@ -32,10 +30,8 @@ @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=1, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -45,11 +41,9 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=1, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -59,10 +53,8 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): - plugin_parity_test( + 
sharded_parity_test( gpus=2, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -73,11 +65,9 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -88,11 +78,9 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -104,11 +92,9 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): reason="test should be run outside of pytest") @DDPLauncher.run("--accelerator ddp --gpus 2 --precision 32") def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( + sharded_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -119,11 +105,9 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): reason="test should be run outside of pytest") @DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16") def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( + sharded_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -136,10 +120,8 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ Ensures same results using multiple optimizers across multiple GPUs """ - plugin_parity_test( - plugin=DDPShardedPlugin(), + sharded_parity_test( gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -153,10 +135,8 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ Ensures using multiple optimizers across multiple GPUs with manual optimization """ - plugin_parity_test( - plugin=DDPShardedPlugin(), + sharded_parity_test( gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -253,11 +233,9 @@ def record_ddp_fit_model_stats(trainer, model, use_cuda): return max_memory, total_time -def plugin_parity_test( +def sharded_parity_test( model_cls: Type[SeedTrainLoaderModel], - plugin: Union[str, DDPPlugin], seed: int = 42, - accelerator: str = 'ddp_spawn', gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, @@ -268,9 +246,7 @@ def plugin_parity_test( Args: model_cls: Model class to use for test. 
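After this commit the benchmark helper no longer takes `plugin` or `accelerator` arguments: the baseline always runs with accelerator='ddp_spawn' and the candidate with accelerator='ddp_sharded_spawn', so callers pass only the model class and the hardware settings. A sketch of a call under those assumptions, with the GPU count and tolerance copied from the tests above rather than chosen here:

    # Train the same seeded model once with plain DDP-spawn and once with
    # sharded DDP-spawn, then compare wall time and peak memory.
    sharded_parity_test(
        gpus=2,
        precision=16,
        model_cls=SeedTrainLoaderModel,
        max_percent_speed_diff=0.25,  # looser bound when only 2 GPUs shard the optimizers
    )
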
- plugin: Plugin to parity test. seed: Seed for generators. Note that this does not handle the seed for data-loading on multi-process. - accelerator: Accelerator type for test. gpus: Number of GPUS to enable. precision: Whether to use AMP or normal FP32 training. max_percent_speed_diff: The maximum speed difference compared to normal DDP training. @@ -288,7 +264,7 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, + accelerator='ddp_spawn', ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( @@ -306,8 +282,7 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, - plugins=[plugin], + accelerator='ddp_sharded_spawn', ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 65529ddc89825..eca02dbc2f902 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,7 +19,7 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, ShardedDDPPlugin, ShardedSpawnDDPPlugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus @@ -227,9 +227,9 @@ def select_training_type_plugin(self): use_torchelastic_ddp = False if use_ddp_sharded: - ddp_plugin_cls = ShardedDDPPlugin + ddp_plugin_cls = DDPShardedPlugin elif use_ddp_sharded_spawn: - ddp_plugin_cls = ShardedSpawnDDPPlugin + ddp_plugin_cls = DDPSpawnShardedPlugin elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: @@ -347,6 +347,10 @@ def set_distributed_mode(self): self.parallel_device_ids = None self.use_ddp = True + # Sharded DDP + elif self.distributed_backend in ("ddp_sharded", "ddp_sharded_spawn"): + self.use_ddp = True + # HOROVOD elif self.distributed_backend == "horovod": self._set_horovod_backend() diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py index e4c6f2076e14b..0c7265f4be29d 100644 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -1,5 +1,5 @@ from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py 
b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index 4d27cb2cebc04..9df1e330bef47 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -16,7 +16,7 @@ from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 1da1a00e0c1a1..8ff2d65c4f6d7 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,8 +2,8 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedDDPPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import ShardedSpawnDDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index 5aebd58937165..ea5842c4b34d5 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins import DDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class ShardedDDPPlugin(DDPPlugin): +class DDPShardedPlugin(DDPPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index 3f6862cb9ff7f..a38d283cdc003 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins import DDPSpawnPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class 
ShardedSpawnDDPPlugin(DDPSpawnPlugin): +class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 80226bc8ef941..fc4f35b33b241 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -6,10 +6,9 @@ import torch from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin -from pytorch_lightning.plugins.sharded_plugin import _FAIRSCALE_AVAILABLE, DDPShardedPlugin -from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE +from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel @@ -26,28 +25,30 @@ }, ) @mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("torch.cuda.is_available", return_value=True) @pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + ["accelerator", "gpus"], + [("ddp_sharded", 1), ("ddp_sharded_spawn", 1)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): +def test_ddp_choice_sharded(tmpdir, accelerator, gpus): """ Test to ensure that plugin is correctly chosen """ class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) + if accelerator == 'ddp_sharded': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) + if accelerator == 'ddp_sharded_spawn': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) From a7c0d8fb2a195df2ab2d6eb6bf8a6a5106b154f8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 20:35:07 +0000 Subject: [PATCH 107/157] Fix initial test --- tests/plugins/test_sharded_plugin.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index fc4f35b33b241..c0b4877e82ad7 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -13,25 +13,12 @@ from tests.base.boring_model import BoringModel -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) @pytest.mark.parametrize( - ["accelerator", "gpus"], - [("ddp_sharded", 1), ("ddp_sharded_spawn", 1)] + ["accelerator"], + [("ddp_sharded",), ("ddp_sharded_spawn",)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_sharded(tmpdir, accelerator, gpus): +def 
test_sharded_ddp_choice(tmpdir, accelerator): """ Test to ensure that plugin is correctly chosen """ @@ -40,14 +27,13 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): if accelerator == 'ddp_sharded': assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) - if accelerator == 'ddp_sharded_spawn': + elif accelerator == 'ddp_sharded_spawn': assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, accelerator=accelerator, callbacks=[CB()], ) @@ -67,8 +53,7 @@ def test_invalid_apex_sharded(tmpdir): with pytest.raises(MisconfigurationException, match='Sharded Plugin is not supported with Apex AMP'): trainer = Trainer( fast_dev_run=True, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', precision=16, amp_backend='apex', ) From 02df0adf128d2a0162810bbf3b1b1e7748fb4687 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 14 Jan 2021 12:26:23 +0000 Subject: [PATCH 108/157] Reflect hook logic from master, should wrap model after move to device --- .../accelerators/accelerator_connector.py | 5 ++ .../plugins/training_type/sharded.py | 32 ++++++++-- .../plugins/training_type/sharded_spawn.py | 32 ++++++++-- tests/plugins/test_sharded_plugin.py | 64 ++++++------------- 4 files changed, 76 insertions(+), 57 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index eca02dbc2f902..56fd5e16642e4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -198,6 +198,11 @@ def select_precision_plugin(self): rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: + if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + raise MisconfigurationException( + 'Sharded Plugin is not supported with Apex AMP, ' + 'please use native AMP for 16-bit precision.'
+ ) log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index ea5842c4b34d5..1ba54bf8419bb 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -1,6 +1,8 @@ +from typing import Optional + from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -10,15 +12,12 @@ class DDPShardedPlugin(DDPPlugin): def configure_ddp(self): + self._wrap_optimizers() self._model = LightningShardedDataParallel( self.model, sharded_optimizer=self.lightning_module.trainer.optimizers ) - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - super().init_ddp_connection(global_rank, world_size) - self._reinit_optimizers_with_oss() - def _reinit_optimizers_with_oss(self): optimizers = self.lightning_module.trainer.optimizers for x, optimizer in enumerate(optimizers): @@ -33,4 +32,25 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - self.lightning_module.trainer.convert_to_lightning_optimizers() + trainer = self.lightning_module.trainer + trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + + def _wrap_optimizers(self): + trainer = self.model.trainer + if trainer.testing is True: + return + self._reinit_optimizers_with_oss() + + def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + optimizer.consolidate_state_dict() + return self._optim_state_dict(optimizer) + + @rank_zero_only + def _optim_state_dict(self, optimizer): + """ + Retrieves state dict only on rank 0, which contains the entire optimizer state after calling + :meth:`consolidate_state_dict`. 
+ """ + return optimizer.state_dict() diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index a38d283cdc003..d2346831579b8 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -1,6 +1,8 @@ +from typing import Optional + from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -10,15 +12,12 @@ class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self): + self._wrap_optimizers() self._model = LightningShardedDataParallel( self.model, sharded_optimizer=self.lightning_module.trainer.optimizers ) - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - super().init_ddp_connection(global_rank, world_size) - self._reinit_optimizers_with_oss() - def _reinit_optimizers_with_oss(self): optimizers = self.lightning_module.trainer.optimizers for x, optimizer in enumerate(optimizers): @@ -33,4 +32,25 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - self.lightning_module.trainer.convert_to_lightning_optimizers() + trainer = self.lightning_module.trainer + trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + + def _wrap_optimizers(self): + trainer = self.model.trainer + if trainer.testing is True: + return + self._reinit_optimizers_with_oss() + + def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + optimizer.consolidate_state_dict() + return self._optim_state_dict(optimizer) + + @rank_zero_only + def _optim_state_dict(self, optimizer): + """ + Retrieves state dict only on rank 0, which contains the entire optimizer state after calling + :meth:`consolidate_state_dict`. 
+ """ + return optimizer.state_dict() diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index c0b4877e82ad7..471f919d3245f 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -1,12 +1,12 @@ import os import platform -from unittest import mock import pytest import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ + ShardedNativeMixedPrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,43 +61,28 @@ def test_invalid_apex_sharded(tmpdir): trainer.fit(model) -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + ["accelerator"], + [("ddp_sharded",), ("ddp_sharded_spawn",)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes): +def test_ddp_choice_sharded_amp(tmpdir, accelerator): """ Test to ensure that plugin native amp plugin is correctly chosen when using sharded """ class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) - assert isinstance(trainer.precision_connector.backend, ShardedNativeAMPPlugin) + assert isinstance(trainer.precision_connector.backend, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, + gpus=1, precision=16, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) @@ -114,9 +99,8 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -142,8 +126,7 @@ def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) @@ -169,8 +152,7 @@ def test_ddp_sharded_plugin_finetune(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) trainer.fit(model) @@ -194,9 +176,8 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -208,9 +189,8 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - 
plugins=[DDPShardedPlugin()], fast_dev_run=True, resume_from_checkpoint=checkpoint_path ) @@ -230,8 +210,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=2, ) @@ -244,8 +223,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=1, resume_from_checkpoint=checkpoint_path @@ -264,8 +242,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', gpus=1, fast_dev_run=True ) @@ -278,8 +255,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): model = BoringModel() trainer = Trainer( - plugins=[DDPShardedPlugin()], - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path @@ -297,9 +273,8 @@ def test_ddp_sharded_plugin_test(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -316,9 +291,8 @@ def test_ddp_sharded_plugin_test_multigpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', + accelerator='ddp_sharded_spawn', gpus=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) From d0ebcba37e733b26a3bc0e60e35884796102aa14 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:03:33 +0100 Subject: [PATCH 109/157] Optional state consolidation, since master has optimizers not wrapped --- .../accelerators/plugins/training_type/sharded_spawn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index d2346831579b8..04e171bb9d5a0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -35,6 +35,7 @@ def _reinit_optimizers_with_oss(self): trainer = self.lightning_module.trainer trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + def _wrap_optimizers(self): trainer = self.model.trainer if trainer.testing is True: @@ -44,7 +45,9 @@ def _wrap_optimizers(self): def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: if is_lightning_optimizer(optimizer): optimizer = optimizer._optimizer - optimizer.consolidate_state_dict() + + if isinstance(optimizer, OSS): + optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) @rank_zero_only From 319c3e8d8509bf37f598be40c347d114849337f2 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:08:20 +0100 Subject: [PATCH 110/157] change attribute for instance test --- tests/plugins/test_sharded_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 471f919d3245f..ac20cd68e36d5 100644 --- a/tests/plugins/test_sharded_plugin.py +++ 
b/tests/plugins/test_sharded_plugin.py @@ -74,7 +74,7 @@ def test_ddp_choice_sharded_amp(tmpdir, accelerator): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, ShardedNativeMixedPrecisionPlugin) + assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() From a34cd15d16a42a0939748e8e97460a52c830b4d3 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:10:25 +0100 Subject: [PATCH 111/157] reset optimizers optimizers are not used in main process, so state would be wrong. --- .../accelerators/plugins/training_type/ddp_spawn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index e2c61bfe6e3fd..e9e4fc364fa03 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -77,6 +77,8 @@ def set_world_ranks(self, process_idx): def start_training(self, trainer): mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + # reset optimizers, since main process is never used for training and thus does not have a valid optim state + trainer.optimizers = [] def start_testing(self, trainer): mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) @@ -210,4 +212,4 @@ def model_to_device(self): def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) - return output \ No newline at end of file + return output From c95b06af23ae764ca445d52a63a44037f9b49bd0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:45:27 +0100 Subject: [PATCH 112/157] legacy --- pytorch_lightning/accelerators/{old => legacy}/__init__.py | 0 .../accelerators/{old => legacy}/accelerator_connector.py | 0 pytorch_lightning/accelerators/{old => legacy}/cpu_accelerator.py | 0 pytorch_lightning/accelerators/{old => legacy}/ddp_accelerator.py | 0 .../accelerators/{old => legacy}/ddp_hpc_accelerator.py | 0 .../accelerators/{old => legacy}/ddp_spawn_accelerator.py | 0 .../accelerators/{old => legacy}/horovod_accelerator.py | 0 pytorch_lightning/accelerators/{old => legacy}/tpu_accelerator.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename pytorch_lightning/accelerators/{old => legacy}/__init__.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/accelerator_connector.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/cpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/horovod_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/tpu_accelerator.py (100%) diff --git a/pytorch_lightning/accelerators/old/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py similarity index 100% rename from pytorch_lightning/accelerators/old/__init__.py rename to pytorch_lightning/accelerators/legacy/__init__.py diff --git 
a/pytorch_lightning/accelerators/old/accelerator_connector.py b/pytorch_lightning/accelerators/legacy/accelerator_connector.py similarity index 100% rename from pytorch_lightning/accelerators/old/accelerator_connector.py rename to pytorch_lightning/accelerators/legacy/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/old/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/cpu_accelerator.py rename to pytorch_lightning/accelerators/legacy/cpu_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/old/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/horovod_accelerator.py rename to pytorch_lightning/accelerators/legacy/horovod_accelerator.py diff --git a/pytorch_lightning/accelerators/old/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/tpu_accelerator.py rename to pytorch_lightning/accelerators/legacy/tpu_accelerator.py From 9ff0c64f16194463dcc87f7773f8773fe81f56c6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:46:20 +0100 Subject: [PATCH 113/157] imports in accel --- pytorch_lightning/accelerators/accelerator.py | 8 +++----- .../accelerators/accelerator_connector.py | 11 ++++++----- pytorch_lightning/accelerators/cpu.py | 2 +- pytorch_lightning/accelerators/gpu.py | 3 ++- pytorch_lightning/accelerators/legacy/__init__.py | 3 ++- .../accelerators/legacy/ddp_accelerator.py | 4 ++-- .../accelerators/legacy/ddp_hpc_accelerator.py | 6 +++--- .../accelerators/legacy/ddp_spawn_accelerator.py | 4 ++-- .../accelerators/plugins/base_plugin.py | 2 ++ .../accelerators/plugins/precision/apex_amp.py | 5 +++-- .../accelerators/plugins/precision/mixed.py | 3 ++- .../accelerators/plugins/precision/native_amp.py | 4 +++- .../plugins/precision/precision_plugin.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- .../plugins/training_type/__init__.py | 4 ++-- .../accelerators/plugins/training_type/ddp.py | 10 +++++----- .../plugins/training_type/ddp_spawn.py | 15 +++++++-------- .../accelerators/plugins/training_type/dp.py | 2 ++ .../accelerators/plugins/training_type/horovod.py | 7 ++++--- .../plugins/training_type/parallel.py | 2 ++ .../plugins/training_type/single_device.py | 1 + .../plugins/training_type/training_type_plugin.py | 4 ++-- 22 files changed, 58 insertions(+), 46 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py 
index 3a6c0e8f6bfbe..4834fdf39f0ae 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,19 +1,17 @@ -from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.utilities import AMPType from typing import Any -import math import torch from torch.optim import Optimizer -from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.accelerators.plugins.precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ) - +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import move_data_to_device diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 56fd5e16642e4..808472f4a4c73 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -13,23 +13,24 @@ # limitations under the License. import os + import torch +from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ + DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment +from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log -from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment -from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment try: import torch_xla diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 820fab6d7d0f8..a39aace801993 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,5 +1,5 @@ -from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 7b2cbe3627e0b..8084217019c0f 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ 
b/pytorch_lightning/accelerators/gpu.py @@ -1,6 +1,7 @@ import torch -from pytorch_lightning.utilities.exceptions import MisconfigurationException + from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException class GPUAccelerator(Accelerator): diff --git a/pytorch_lightning/accelerators/legacy/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py index d8bf7061de11f..d566b7301b788 100644 --- a/pytorch_lightning/accelerators/legacy/__init__.py +++ b/pytorch_lightning/accelerators/legacy/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ddp2_accelerator import DDP2Accelerator # noqa: F401 from pytorch_lightning.accelerators.ddp_accelerator import DDPAccelerator # noqa: F401 @@ -23,3 +22,5 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator # noqa: F401 from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator # noqa: F401 + +from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 56f6eaa2223a3..987eda50476f1 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -21,6 +21,8 @@ import numpy as np import torch import torch.distributed as torch_distrib +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -28,8 +30,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index cf6aad9999223..8df353b025378 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -14,8 +14,10 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.distributed as dist +import torch.distributed as torch_distrib +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -23,8 +25,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed -from 
pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index e23943e9262f8..33af749a229ee 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -18,6 +18,8 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -25,8 +27,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load diff --git a/pytorch_lightning/accelerators/plugins/base_plugin.py b/pytorch_lightning/accelerators/plugins/base_plugin.py index 3ecfb48726f76..7c818db322916 100644 --- a/pytorch_lightning/accelerators/plugins/base_plugin.py +++ b/pytorch_lightning/accelerators/plugins/base_plugin.py @@ -1,6 +1,8 @@ import contextlib + import torch + class Plugin(object): def connect(self, model: torch.nn.Module, *args, **kwargs): diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py index 08b4fe7906732..967324b1a3490 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -1,10 +1,11 @@ -from contextlib import contextmanager from typing import List, Tuple + import torch from torch.optim import Optimizer + +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin if _APEX_AVAILABLE: from apex import amp diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/accelerators/plugins/precision/mixed.py index 1eb1ea18ebc23..f96a47f35c04c 100644 --- a/pytorch_lightning/accelerators/plugins/precision/mixed.py +++ b/pytorch_lightning/accelerators/plugins/precision/mixed.py @@ -1,5 +1,6 @@ -from pytorch_lightning.utilities import AMPType from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import AMPType + class MixedPrecisionPlugin(PrecisionPlugin): EPSILON = 1e-5 diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/accelerators/plugins/precision/native_amp.py index f233a43dfdd53..fad0d1f469c34 100644 --- a/pytorch_lightning/accelerators/plugins/precision/native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/native_amp.py @@ 
-1,9 +1,11 @@ from contextlib import contextmanager + import torch + +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py index 6098edfde60b4..120fbcafbecf9 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -4,8 +4,8 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.core import LightningModule from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.core import LightningModule class PrecisionPlugin(Plugin): diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index 9df1e330bef47..969780dd1df7e 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -13,10 +13,10 @@ # limitations under the License. from typing import Union, cast -from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE from torch.optim import Optimizer from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 8ff2d65c4f6d7..152fdc68d552e 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,9 +2,9 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index 4e865a959ae73..b314a230076b0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ 
b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -1,19 +1,19 @@ import os -import sys import subprocess +import sys from time import sleep -import numpy as np from typing import Any, Dict, Optional, Union +import numpy as np import torch import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.utilities import _HYDRA_AVAILABLE -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index e9e4fc364fa03..f572f9af36f06 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -1,22 +1,21 @@ -import re import os -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +import re from typing import Any, Dict, Optional, Union -import torch -import torch.multiprocessing as mp +import torch import torch.distributed as torch_distrib +import torch.multiprocessing as mp +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn from pytorch_lightning.utilities.seed import seed_everything -from pytorch_lightning import _logger as log - if torch.distributed.is_available(): from torch.distributed import ReduceOp else: diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/accelerators/plugins/training_type/dp.py index 0c50d077633af..d77aa52fc700c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/dp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/dp.py @@ -1,10 +1,12 @@ from typing import List import torch + from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningDataParallel + class DataParallelPlugin(ParallelPlugin): def __init__(self, parallel_devices: List[torch.device]): 
diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py index fee77f762fde1..eb2edd2f3e414 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -1,12 +1,13 @@ from contextlib import ExitStack -from pytorch_lightning.utilities.distributed import rank_zero_only from typing import Any, List, Optional, Union import torch +from torch.optim.lr_scheduler import _LRScheduler + from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.core.optimizer import LightningOptimizer -from torch.optim.lr_scheduler import _LRScheduler +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE +from pytorch_lightning.utilities.distributed import rank_zero_only if _HOROVOD_AVAILABLE: import horovod.torch as hvd diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/accelerators/plugins/training_type/parallel.py index fd366f677b55f..865e7e6b4bd1c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/parallel.py +++ b/pytorch_lightning/accelerators/plugins/training_type/parallel.py @@ -1,7 +1,9 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from typing import List, Optional + import torch + from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/accelerators/plugins/training_type/single_device.py index 2e674ef87fbb4..200072ee82651 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/single_device.py +++ b/pytorch_lightning/accelerators/plugins/training_type/single_device.py @@ -1,4 +1,5 @@ import torch + from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py index 94d4dbf9d3409..c5e400494e82c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py @@ -1,12 +1,12 @@ import os - from abc import ABC, abstractmethod from typing import Optional + import torch +from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.plugins.base_plugin import Plugin -from pytorch_lightning import _logger as log class TrainingTypePlugin(Plugin, ABC): def __init__(self): From 67d4e47281942e2a79d279a6f6774843c6ab1f16 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:46:45 +0100 Subject: [PATCH 114/157] legacy2 --- .../cluster_environments/cluster_environment.py | 2 +- pytorch_lightning/plugins/{old => legacy}/__init__.py | 0 pytorch_lightning/plugins/{old => legacy}/apex.py | 2 +- .../plugins/{old => legacy}/ddp_plugin.py | 2 +- .../plugins/{old => legacy}/ddp_sequential_plugin.py | 2 +- .../plugins/{old => legacy}/native_amp.py | 2 +- pytorch_lightning/plugins/{old => legacy}/plugin.py | 0 .../plugins/{old => legacy}/plugin_connector.py | 10 +++++----- .../plugins/{old => legacy}/precision_plugin.py | 2 
+- .../plugins/{old => legacy}/rpc_plugin.py | 2 +- .../{old => legacy}/sharded_native_amp_plugin.py | 2 +- .../plugins/{old => legacy}/sharded_plugin.py | 4 ++-- .../trainer/connectors/precision_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 2 +- tests/plugins/test_plugin_properties.py | 2 +- 15 files changed, 19 insertions(+), 19 deletions(-) rename pytorch_lightning/plugins/{old => legacy}/__init__.py (100%) rename pytorch_lightning/plugins/{old => legacy}/apex.py (98%) rename pytorch_lightning/plugins/{old => legacy}/ddp_plugin.py (99%) rename pytorch_lightning/plugins/{old => legacy}/ddp_sequential_plugin.py (99%) rename pytorch_lightning/plugins/{old => legacy}/native_amp.py (97%) rename pytorch_lightning/plugins/{old => legacy}/plugin.py (100%) rename pytorch_lightning/plugins/{old => legacy}/plugin_connector.py (95%) rename pytorch_lightning/plugins/{old => legacy}/precision_plugin.py (95%) rename pytorch_lightning/plugins/{old => legacy}/rpc_plugin.py (98%) rename pytorch_lightning/plugins/{old => legacy}/sharded_native_amp_plugin.py (94%) rename pytorch_lightning/plugins/{old => legacy}/sharded_plugin.py (95%) diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 8652d701dbf83..41af4fe84c7f0 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin class ClusterEnvironment(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/__init__.py b/pytorch_lightning/plugins/legacy/__init__.py similarity index 100% rename from pytorch_lightning/plugins/old/__init__.py rename to pytorch_lightning/plugins/legacy/__init__.py diff --git a/pytorch_lightning/plugins/old/apex.py b/pytorch_lightning/plugins/legacy/apex.py similarity index 98% rename from pytorch_lightning/plugins/old/apex.py rename to pytorch_lightning/plugins/legacy/apex.py index d917924eb0960..d8562c6a70d71 100644 --- a/pytorch_lightning/plugins/old/apex.py +++ b/pytorch_lightning/plugins/legacy/apex.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.legacy.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import rank_zero_warn diff --git a/pytorch_lightning/plugins/old/ddp_plugin.py b/pytorch_lightning/plugins/legacy/ddp_plugin.py similarity index 99% rename from pytorch_lightning/plugins/old/ddp_plugin.py rename to pytorch_lightning/plugins/legacy/ddp_plugin.py index 360479de5a665..24455bc873919 100644 --- a/pytorch_lightning/plugins/old/ddp_plugin.py +++ b/pytorch_lightning/plugins/legacy/ddp_plugin.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedModule, prepare_for_backward -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin from pytorch_lightning.utilities import 
DeviceType diff --git a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py b/pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py similarity index 99% rename from pytorch_lightning/plugins/old/ddp_sequential_plugin.py rename to pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py index dc39d648d2f13..a80f3ef7c795f 100644 --- a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py @@ -21,7 +21,7 @@ from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/native_amp.py b/pytorch_lightning/plugins/legacy/native_amp.py similarity index 97% rename from pytorch_lightning/plugins/old/native_amp.py rename to pytorch_lightning/plugins/legacy/native_amp.py index 832d6acc672b4..d691134f0b4da 100644 --- a/pytorch_lightning/plugins/old/native_amp.py +++ b/pytorch_lightning/plugins/legacy/native_amp.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.legacy.precision_plugin import PrecisionPlugin class NativeAMPPlugin(PrecisionPlugin): diff --git a/pytorch_lightning/plugins/old/plugin.py b/pytorch_lightning/plugins/legacy/plugin.py similarity index 100% rename from pytorch_lightning/plugins/old/plugin.py rename to pytorch_lightning/plugins/legacy/plugin.py diff --git a/pytorch_lightning/plugins/old/plugin_connector.py b/pytorch_lightning/plugins/legacy/plugin_connector.py similarity index 95% rename from pytorch_lightning/plugins/old/plugin_connector.py rename to pytorch_lightning/plugins/legacy/plugin_connector.py index 77dae1229743e..c6af30613c39a 100644 --- a/pytorch_lightning/plugins/old/plugin_connector.py +++ b/pytorch_lightning/plugins/legacy/plugin_connector.py @@ -15,11 +15,11 @@ from typing import List, Optional, Union from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.old.apex import ApexPlugin -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin -from pytorch_lightning.plugins.old.plugin import LightningPlugin -from pytorch_lightning.plugins.old.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import AMPType, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/precision_plugin.py b/pytorch_lightning/plugins/legacy/precision_plugin.py similarity index 95% rename from pytorch_lightning/plugins/old/precision_plugin.py rename to pytorch_lightning/plugins/legacy/precision_plugin.py index 69d8e3670678d..1041e9d6b0faf 100644 --- a/pytorch_lightning/plugins/old/precision_plugin.py +++ b/pytorch_lightning/plugins/legacy/precision_plugin.py @@ -15,7 +15,7 @@ from 
torch.optim import Optimizer -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin class PrecisionPlugin(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/rpc_plugin.py b/pytorch_lightning/plugins/legacy/rpc_plugin.py similarity index 98% rename from pytorch_lightning/plugins/old/rpc_plugin.py rename to pytorch_lightning/plugins/legacy/rpc_plugin.py index 4445b1d35970e..89f60f1d783c8 100644 --- a/pytorch_lightning/plugins/old/rpc_plugin.py +++ b/pytorch_lightning/plugins/legacy/rpc_plugin.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE DEFAULT_RPC_TIMEOUT_SEC = 60. diff --git a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py similarity index 94% rename from pytorch_lightning/plugins/old/sharded_native_amp_plugin.py rename to pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py index c29821dcd8a8d..f507c8c3bd6c0 100644 --- a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/old/sharded_plugin.py b/pytorch_lightning/plugins/legacy/sharded_plugin.py similarity index 95% rename from pytorch_lightning/plugins/old/sharded_plugin.py rename to pytorch_lightning/plugins/legacy/sharded_plugin.py index 19e0859587585..bf008e34fc3ca 100644 --- a/pytorch_lightning/plugins/old/sharded_plugin.py +++ b/pytorch_lightning/plugins/legacy/sharded_plugin.py @@ -15,8 +15,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.old.sharded_native_amp_plugin import ShardedNativeAMPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.sharded_native_amp_plugin import ShardedNativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, AMPType, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index af8db214eff9d..f3c9de66a811d 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -13,8 +13,8 @@ # limitations under the License. 
from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.apex import ApexPlugin -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5bf2fdcea7991..11e440bf0f52d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -24,7 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.plugin_connector import PluginConnector +from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector diff --git a/tests/plugins/test_plugin_properties.py b/tests/plugins/test_plugin_properties.py index ef87a79d4bb5c..1a6556c0f76ff 100644 --- a/tests/plugins/test_plugin_properties.py +++ b/tests/plugins/test_plugin_properties.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from pytorch_lightning import Trainer -from pytorch_lightning.plugins.old.plugin_connector import LightningCustomPlugins, PluginConnector +from pytorch_lightning.plugins.legacy.plugin_connector import LightningCustomPlugins, PluginConnector def test_available_plugins_trainer(): From 577b00df62cc2b3cbee99a254e44a03578a9d489 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:47:27 +0100 Subject: [PATCH 115/157] trainer imports --- pytorch_lightning/trainer/trainer.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 11e440bf0f52d..a6b35a468e48d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,7 +15,6 @@ """Trainer to automate the training.""" import os -from pytorch_lightning.core.memory import ModelSummary import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -24,15 +23,14 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector -from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes -from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector -from pytorch_lightning.callbacks import Callback, ModelCheckpoint +from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.loggers import LightningLoggerBase +from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import 
ConfigValidator @@ -44,7 +42,6 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector @@ -59,15 +56,6 @@ from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin -from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector -from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector -from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.trainer.connectors.model_connector import ModelConnector -from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector -from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector -from pytorch_lightning import _logger as log from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -75,8 +63,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_utils import is_overridden -from pytorch_lightning.trainer.properties import TrainerProperties -from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer warnings.filterwarnings( From aa4858b070bca27f0c21f1128c0fc1dc734e1958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:19:54 +0100 Subject: [PATCH 116/157] fix import errors after rebase --- pytorch_lightning/trainer/trainer.py | 1 + tests/deprecated_api/test_remove_1-4.py | 2 +- tests/models/test_sync_batchnorm.py | 2 +- tests/plugins/test_amp_plugin.py | 2 +- tests/plugins/test_apex_plugin.py | 2 +- tests/plugins/test_ddp_plugin.py | 4 ++-- tests/plugins/test_ddp_sequential_plugin.py | 2 +- tests/plugins/test_plugin.py | 4 ++-- tests/plugins/test_rpc_plugin.py | 2 +- tests/plugins/test_sharded_plugin.py | 2 +- 10 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a6b35a468e48d..584dae3437ff2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -23,6 +23,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule diff --git a/tests/deprecated_api/test_remove_1-4.py 
b/tests/deprecated_api/test_remove_1-4.py index 00f02076fccef..fc3b201d88a74 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -19,7 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from tests.base import BoringModel from tests.deprecated_api import _soft_unimport_module diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index fe00acff62624..444067d82bd9e 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -17,7 +17,7 @@ import torch.nn.functional as F from pytorch_lightning import LightningModule, seed_everything, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import FLOAT16_EPSILON from tests.base.datamodules import MNISTDataModule diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 1e98740f99d62..48833e292564a 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -6,7 +6,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_apex_plugin.py b/tests/plugins/test_apex_plugin.py index df6d76547bcf6..1f452933ec6a0 100644 --- a/tests/plugins/test_apex_plugin.py +++ b/tests/plugins/test_apex_plugin.py @@ -5,7 +5,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index fe8fc555ba06c..4bdaad74b67ab 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -6,8 +6,8 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_ddp_sequential_plugin.py b/tests/plugins/test_ddp_sequential_plugin.py index 460d195f6723b..ddb1bd6768e29 100644 --- a/tests/plugins/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_ddp_sequential_plugin.py @@ -20,7 +20,7 @@ from torch import nn from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin +from pytorch_lightning.plugins.legacy.ddp_sequential_plugin import DDPSequentialPlugin from pytorch_lightning.utilities 
import _FAIRSCALE_PIPE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import RandomDataset diff --git a/tests/plugins/test_plugin.py b/tests/plugins/test_plugin.py index 05789596879b4..4b01b4402611d 100644 --- a/tests/plugins/test_plugin.py +++ b/tests/plugins/test_plugin.py @@ -17,8 +17,8 @@ import pytest from pytorch_lightning import Callback, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py index a28cd4b50e4f4..77937c16058dc 100644 --- a/tests/plugins/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -7,7 +7,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index ac20cd68e36d5..0bd13db5a9052 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -74,7 +74,7 @@ def test_ddp_choice_sharded_amp(tmpdir, accelerator): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin)) + assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() From f81a44f22a40d5433e7fc41b5f24331703a5059c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:33:00 +0100 Subject: [PATCH 117/157] move hook to new setup location --- pytorch_lightning/trainer/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 584dae3437ff2..96f4eaf430101 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -500,7 +500,6 @@ def fit( # SET UP TRAINING # ---------------------------- # self.accelerator_backend = self.accelerator_connector.select_accelerator() - self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) @@ -511,6 +510,8 @@ def fit( self.call_hook("on_fit_start") # plugin will setup training (e.g. ddp will launch child processes) + # TODO: the old setup is now called "pre_training", where should this hook be called now? 
+ self.call_hook("on_before_accelerator_backend_setup", model) self.training_type_plugin.pre_training() self.call_setup_hook(self.lightning_module) From a2856650291de3b1d0befbd6acc8547029c32b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:44:05 +0100 Subject: [PATCH 118/157] provide unwrapping logic --- .../accelerators/plugins/training_type/ddp.py | 4 ++-- .../accelerators/plugins/training_type/ddp_spawn.py | 4 ++-- pytorch_lightning/overrides/data_parallel.py | 9 +++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index b314a230076b0..08f27f3d9e15c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -12,7 +12,7 @@ from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -60,7 +60,7 @@ def root_device(self): @property def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index f572f9af36f06..622ac2a726998 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -10,7 +10,7 @@ from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn @@ -52,7 +52,7 @@ def root_device(self): @property def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 69676cf77e079..84475a755065a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -62,6 +62,15 @@ def 
get_a_var(obj): # pragma: no-cover warning_cache = WarningCache() +def unwrap_lightning_module(wrapped_model): + model = wrapped_model + if isinstance(model, (LightningDistributedDataParallel, LightningDataParallel)): + model = model.module + if isinstance(model, LightningDistributedModule): + model = model.module + return model + + class LightningDataParallel(DataParallel): """ Override the forward call in lightning so it goes to training and validation step respectively From bf78d7048315ff735c70c9cfe8cfbdd0770a0b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:50:57 +0100 Subject: [PATCH 119/157] fix trainer callback system --- tests/callbacks/test_callbacks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index f3e1dabfb6e59..e9bb7452a1abb 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -55,8 +55,8 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), call.on_fit_start(trainer, model), + call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'fit'), call.on_pretrain_routine_start(trainer, model), call.on_pretrain_routine_end(trainer, model), @@ -111,6 +111,7 @@ def test_trainer_callback_system(torch_save): call.on_init_start(trainer), call.on_init_end(trainer), call.on_fit_start(trainer, model), + call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'test'), # call.on_pretrain_routine_start(trainer, model), # call.on_pretrain_routine_end(trainer, model), From 34947cf0840909bdff0e955dbdac315c89868370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 06:04:09 +0100 Subject: [PATCH 120/157] added ddp2 implementation --- .../plugins/training_type/ddp2.py | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py index 078dfe6cd6ec1..ff55ef72e0f83 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py @@ -1,5 +1,41 @@ +import torch + from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.core.step_result import Result + -# TODO: DDP2 class DDP2Plugin(DDPPlugin): - pass \ No newline at end of file + + def setup(self, model): + self._model = model + # set the task idx + self.task_idx = self.cluster_environment.local_rank() + # the difference to DDP is that we don't call children processes here + + def reduce(self, output, *args, **kwargs): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_devices[0] + + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=self.num_nodes, rank=self.global_rank) + return distributed_sampler_kwargs + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank + 
self.world_size = self.num_nodes From 49bec5391ab019bef1301bb05bb8546e7df463bf Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:15:05 +0100 Subject: [PATCH 121/157] fix imports .legacy --- .../basic_examples/conv_sequential_example.py | 2 +- .../accelerators/legacy/__init__.py | 24 +++++++++---------- .../accelerators/legacy/cpu_accelerator.py | 13 ++++++---- .../accelerators/legacy/ddp_accelerator.py | 8 +++---- .../legacy/ddp_hpc_accelerator.py | 9 +++---- .../legacy/ddp_spawn_accelerator.py | 8 +++---- .../legacy/horovod_accelerator.py | 4 ++-- .../accelerators/legacy/tpu_accelerator.py | 3 ++- 8 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 84efb4bea7670..38e077071d59e 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -32,7 +32,7 @@ from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy -from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin +from pytorch_lightning.plugins.legacy.ddp_sequential_plugin import DDPSequentialPlugin from pytorch_lightning.utilities import _BOLTS_AVAILABLE, _FAIRSCALE_PIPE_AVAILABLE if _BOLTS_AVAILABLE: diff --git a/pytorch_lightning/accelerators/legacy/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py index d566b7301b788..a388f522d63bf 100644 --- a/pytorch_lightning/accelerators/legacy/__init__.py +++ b/pytorch_lightning/accelerators/legacy/__init__.py @@ -11,16 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
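Aside on the DDP2 plugin introduced in the previous commit: it keeps one process per node and lets DP-style logic collapse the per-GPU outputs inside that node, so its reduce() only averages plain tensors (Result objects go through Result.dp_reduce() instead). A tiny illustrative sketch of that tensor fallback, independent of any cluster setup:

    import torch

    per_gpu_losses = torch.tensor([0.5, 1.5])  # what DP hands back from one node's GPUs
    reduced = per_gpu_losses.mean()            # DDP2Plugin.reduce() falls back to a mean for tensors
    assert reduced.item() == 1.0
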
-from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp2_accelerator import DDP2Accelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_accelerator import DDPAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_cpu_spawn_accelerator import DDPCPUSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_spawn_accelerator import DDPSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.dp_accelerator import DataParallelAccelerator # noqa: F401 -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator # noqa: F401 -from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.cpu_accelerator import CPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp2_accelerator import DDP2Accelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_accelerator import DDPAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_cpu_spawn_accelerator import DDPCPUSpawnAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_hpc_accelerator import DDPHPCAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_spawn_accelerator import DDPSpawnAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.dp_accelerator import DataParallelAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.tpu_accelerator import TPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py index 7c80a4a30d223..efe14ff6b9b4b 100644 --- a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py @@ -15,9 +15,10 @@ import torch -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -79,10 +80,12 @@ def validation_step(self, args): def test_step(self, args): return self._step(self.trainer.model.test_step, args) - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + def sync_tensor( + self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None, + ) -> torch.Tensor: return tensor @property diff --git 
a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 987eda50476f1..729ae2ec2ba94 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -21,12 +21,12 @@ import numpy as np import torch import torch.distributed as torch_distrib -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed @@ -35,7 +35,7 @@ all_gather_ddp_if_available, find_free_network_port, rank_zero_only, - sync_ddp_if_available, + sync_ddp_if_available, ReduceOp, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index 8df353b025378..58fd60ac18a69 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -16,17 +16,18 @@ import torch import torch.distributed as dist import torch.distributed as torch_distrib -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available, \ + ReduceOp class DDPHPCAccelerator(Accelerator): diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index 33af749a229ee..39871a6c6d344 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -18,12 +18,12 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import 
DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed import LightningDistributed @@ -35,7 +35,7 @@ find_free_network_port, rank_zero_only, rank_zero_warn, - sync_ddp_if_available, + sync_ddp_if_available, ReduceOp, ) from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py index 150be86210866..7d41dd990e7ad 100644 --- a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py @@ -17,10 +17,10 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType, DeviceType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp if _HOROVOD_AVAILABLE: import horovod.torch as hvd diff --git a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py index 66fc236a2a775..158978cbcbba9 100644 --- a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py @@ -21,7 +21,7 @@ from torch.optim import Optimizer from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( @@ -32,6 +32,7 @@ rank_zero_warn, ) from pytorch_lightning.utilities.cloud_io import atomic_save +from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException if _TPU_AVAILABLE: From ba1c986a32d744b406b0bd09f5b3c245a003ce6e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:16:46 +0100 Subject: [PATCH 122/157] move plugins --- pytorch_lightning/accelerators/accelerator.py | 4 ++-- .../accelerators/accelerator_connector.py | 4 ++-- pytorch_lightning/accelerators/cpu.py | 2 +- pytorch_lightning/accelerators/plugins/__init__.py | 3 --- .../accelerators/plugins/precision/__init__.py | 5 ----- .../accelerators/plugins/training_type/__init__.py | 10 ---------- pytorch_lightning/plugins/__init__.py | 4 +++- .../{accelerators => }/plugins/base_plugin.py | 0 pytorch_lightning/plugins/precision/__init__.py | 5 +++++ .../{accelerators => }/plugins/precision/apex_amp.py | 2 +- .../{accelerators => }/plugins/precision/mixed.py | 2 +- .../{accelerators => }/plugins/precision/native_amp.py | 2 +- .../plugins/precision/precision_plugin.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- pytorch_lightning/plugins/training_type/__init__.py | 10 ++++++++++ .../{accelerators => }/plugins/training_type/ddp.py | 2 +- .../{accelerators => 
}/plugins/training_type/ddp2.py | 2 +- .../plugins/training_type/ddp_spawn.py | 2 +- .../{accelerators => }/plugins/training_type/dp.py | 2 +- .../plugins/training_type/horovod.py | 2 +- .../plugins/training_type/parallel.py | 2 +- .../plugins/training_type/sharded.py | 2 +- .../plugins/training_type/sharded_spawn.py | 2 +- .../plugins/training_type/single_device.py | 2 +- .../plugins/training_type/training_type_plugin.py | 2 +- pytorch_lightning/trainer/training_loop.py | 2 +- tests/backends/test_accelerator_connector.py | 4 ++-- tests/plugins/test_sharded_plugin.py | 2 +- 28 files changed, 42 insertions(+), 43 deletions(-) delete mode 100644 pytorch_lightning/accelerators/plugins/__init__.py delete mode 100644 pytorch_lightning/accelerators/plugins/precision/__init__.py delete mode 100644 pytorch_lightning/accelerators/plugins/training_type/__init__.py rename pytorch_lightning/{accelerators => }/plugins/base_plugin.py (100%) create mode 100644 pytorch_lightning/plugins/precision/__init__.py rename pytorch_lightning/{accelerators => }/plugins/precision/apex_amp.py (97%) rename pytorch_lightning/{accelerators => }/plugins/precision/mixed.py (62%) rename pytorch_lightning/{accelerators => }/plugins/precision/native_amp.py (94%) rename pytorch_lightning/{accelerators => }/plugins/precision/precision_plugin.py (97%) rename pytorch_lightning/{accelerators => }/plugins/precision/sharded_native_amp.py (92%) create mode 100644 pytorch_lightning/plugins/training_type/__init__.py rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp.py (99%) rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp2.py (93%) rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp_spawn.py (98%) rename pytorch_lightning/{accelerators => }/plugins/training_type/dp.py (93%) rename pytorch_lightning/{accelerators => }/plugins/training_type/horovod.py (98%) rename pytorch_lightning/{accelerators => }/plugins/training_type/parallel.py (96%) rename pytorch_lightning/{accelerators => }/plugins/training_type/sharded.py (96%) rename pytorch_lightning/{accelerators => }/plugins/training_type/sharded_spawn.py (95%) rename pytorch_lightning/{accelerators => }/plugins/training_type/single_device.py (89%) rename pytorch_lightning/{accelerators => }/plugins/training_type/training_type_plugin.py (97%) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 4834fdf39f0ae..711ad367915ad 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -3,8 +3,8 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.accelerators.plugins.precision import ( +from pytorch_lightning.plugins import TrainingTypePlugin, HorovodPlugin +from pytorch_lightning.plugins .precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 808472f4a4c73..baf14c4146aed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,9 +20,9 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from 
pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ +from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ +from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index a39aace801993..57dc5bf6a8bbf 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,5 +1,5 @@ from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin +from pytorch_lightning.plugins import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/__init__.py b/pytorch_lightning/accelerators/plugins/__init__.py deleted file mode 100644 index 119284ef33c76..0000000000000 --- a/pytorch_lightning/accelerators/plugins/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin -from pytorch_lightning.accelerators.plugins.precision import * -from pytorch_lightning.accelerators.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py deleted file mode 100644 index 0c7265f4be29d..0000000000000 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py deleted file mode 100644 index 152fdc68d552e..0000000000000 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin -from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin -from 
pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index b416a9f56aebe..e023060d5b16a 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -1 +1,3 @@ -from pytorch_lightning.accelerators.plugins import * \ No newline at end of file +from pytorch_lightning.plugins.base_plugin import Plugin +from pytorch_lightning.plugins.precision import * +from pytorch_lightning.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py similarity index 100% rename from pytorch_lightning/accelerators/plugins/base_plugin.py rename to pytorch_lightning/plugins/base_plugin.py diff --git a/pytorch_lightning/plugins/precision/__init__.py b/pytorch_lightning/plugins/precision/__init__.py new file mode 100644 index 0000000000000..8220a1a890867 --- /dev/null +++ b/pytorch_lightning/plugins/precision/__init__.py @@ -0,0 +1,5 @@ +from pytorch_lightning.plugins .precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins .precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/precision/apex_amp.py rename to pytorch_lightning/plugins/precision/apex_amp.py index 967324b1a3490..7ba75ca3d9aaa 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -3,7 +3,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/plugins/precision/mixed.py similarity index 62% rename from pytorch_lightning/accelerators/plugins/precision/mixed.py rename to pytorch_lightning/plugins/precision/mixed.py index f96a47f35c04c..dce279e660144 100644 --- a/pytorch_lightning/accelerators/plugins/precision/mixed.py +++ b/pytorch_lightning/plugins/precision/mixed.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins .precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import AMPType diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py similarity index 94% rename from pytorch_lightning/accelerators/plugins/precision/native_amp.py rename to pytorch_lightning/plugins/precision/native_amp.py index fad0d1f469c34..885d37901d6ee 100644 --- a/pytorch_lightning/accelerators/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -2,7 +2,7 @@ import 
torch -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/precision/precision_plugin.py rename to pytorch_lightning/plugins/precision/precision_plugin.py index 120fbcafbecf9..31e94c612804c 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -4,7 +4,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.plugins .base_plugin import Plugin from pytorch_lightning.core import LightningModule diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/plugins/precision/sharded_native_amp.py similarity index 92% rename from pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py rename to pytorch_lightning/plugins/precision/sharded_native_amp.py index 969780dd1df7e..d7e8ca0020091 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py new file mode 100644 index 0000000000000..7109594600a04 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -0,0 +1,10 @@ +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins .training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins .training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins .training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins .training_type.single_device import SingleDevicePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py similarity index 99% rename from pytorch_lightning/accelerators/plugins/training_type/ddp.py rename to pytorch_lightning/plugins/training_type/ddp.py index 08f27f3d9e15c..06c0a5ce5f03b 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -9,7 +9,7 @@ import torch.distributed as torch_distrib from pytorch_lightning import _logger as log 
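With the new plugins/__init__.py, plugins/precision/__init__.py and plugins/training_type/__init__.py shown above re-exporting every plugin class, callers can import from the package root, which is what trainer.py, training_loop.py and the tests switch to later in this commit. A short sketch of the new import surface (names taken from those __init__ modules; illustrative only):

    from pytorch_lightning.plugins import (
        DDPPlugin,
        DDPSpawnPlugin,
        NativeMixedPrecisionPlugin,
        PrecisionPlugin,
        TrainingTypePlugin,
    )
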
-from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/plugins/training_type/ddp2.py similarity index 93% rename from pytorch_lightning/accelerators/plugins/training_type/ddp2.py rename to pytorch_lightning/plugins/training_type/ddp2.py index ff55ef72e0f83..c693a004a39e0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py +++ b/pytorch_lightning/plugins/training_type/ddp2.py @@ -1,6 +1,6 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin from pytorch_lightning.core.step_result import Result diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py similarity index 98% rename from pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py rename to pytorch_lightning/plugins/training_type/ddp_spawn.py index 622ac2a726998..80886d2555c21 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -7,7 +7,7 @@ import torch.multiprocessing as mp from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py similarity index 93% rename from pytorch_lightning/accelerators/plugins/training_type/dp.py rename to pytorch_lightning/plugins/training_type/dp.py index d77aa52fc700c..c168aa0a42d00 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -2,7 +2,7 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningDataParallel diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py similarity index 98% rename from pytorch_lightning/accelerators/plugins/training_type/horovod.py rename to pytorch_lightning/plugins/training_type/horovod.py index eb2edd2f3e414..ca00b01b6f911 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -4,7 +4,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import 
ParallelPlugin from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py similarity index 96% rename from pytorch_lightning/accelerators/plugins/training_type/parallel.py rename to pytorch_lightning/plugins/training_type/parallel.py index 865e7e6b4bd1c..8bc692b97b3ee 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -4,7 +4,7 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py similarity index 96% rename from pytorch_lightning/accelerators/plugins/training_type/sharded.py rename to pytorch_lightning/plugins/training_type/sharded.py index 1ba54bf8419bb..fb24f8c73315d 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -1,6 +1,6 @@ from typing import Optional -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py similarity index 95% rename from pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py rename to pytorch_lightning/plugins/training_type/sharded_spawn.py index 04e171bb9d5a0..c1020457e3bec 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -1,6 +1,6 @@ from typing import Optional -from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py similarity index 89% rename from pytorch_lightning/accelerators/plugins/training_type/single_device.py rename to pytorch_lightning/plugins/training_type/single_device.py index 200072ee82651..c83d9685c428c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -1,6 +1,6 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin class SingleDevicePlugin(TrainingTypePlugin): diff --git 
a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py rename to pytorch_lightning/plugins/training_type/training_type_plugin.py index c5e400494e82c..363dde8e593f3 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -5,7 +5,7 @@ import torch from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.plugins .base_plugin import Plugin class TrainingTypePlugin(Plugin, ABC): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index b3510f0f400fe..bedd4c57f749d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,7 @@ import numpy as np import torch -from pytorch_lightning.accelerators.plugins import ParallelPlugin +from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 92950274e49cd..79b0505fcdcba 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -22,8 +22,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin -from pytorch_lightning.accelerators.plugins import PrecisionPlugin +from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 0bd13db5a9052..bc4a21db554af 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -5,7 +5,7 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ +from pytorch_lightning.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ ShardedNativeMixedPrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE From 45dfbb7b11b123b497fd70de0901d9d1248aaaab Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:42:33 +0100 Subject: [PATCH 123/157] restore legacy --- .../accelerators/legacy/accelerator.py | 255 +++++++++++++++ .../legacy/accelerator_connector.py | 42 ++- .../accelerators/legacy/cpu_accelerator.py | 2 +- .../accelerators/legacy/ddp2_accelerator.py | 269 ++++++++++++++++ .../accelerators/legacy/ddp_accelerator.py | 2 +- .../legacy/ddp_cpu_hpc_accelerator.py | 48 +++ .../legacy/ddp_cpu_spawn_accelerator.py | 297 ++++++++++++++++++ 
.../legacy/ddp_hpc_accelerator.py | 2 +- .../legacy/ddp_spawn_accelerator.py | 2 +- .../accelerators/legacy/dp_accelerator.py | 189 +++++++++++ .../accelerators/legacy/gpu_accelerator.py | 109 +++++++ .../legacy/horovod_accelerator.py | 2 +- .../accelerators/legacy/tpu_accelerator.py | 2 +- 13 files changed, 1200 insertions(+), 21 deletions(-) create mode 100644 pytorch_lightning/accelerators/legacy/accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp2_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/dp_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/legacy/accelerator.py b/pytorch_lightning/accelerators/legacy/accelerator.py new file mode 100644 index 0000000000000..ea6b21e714b2f --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/accelerator.py @@ -0,0 +1,255 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from torch.optim import Optimizer + +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities.parsing import AttributeDict + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + class ReduceOp: + SUM = None + + +class Accelerator(object): + + def __init__(self, + trainer: Optional = None, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + self.trainer = trainer + self.nickname = None + self.cluster_environment = cluster_environment + self.dist = AttributeDict(rank=0, device=None) + self.ddp_plugin = ddp_plugin + + if trainer is not None: + self.train_loop = self.trainer.train + self.validation_loop = self.trainer.run_evaluation + self.test_loop = self.trainer.run_evaluation + + def setup(self, model): + pass + + def teardown(self): + # Ensure if necessary all processes are finished + self.barrier() + + def barrier(self, name: Optional[str] = None): + pass + + def broadcast(self, obj, src=0): + return obj + + def train_or_test(self): + if self.trainer.testing: + results = self.trainer.run_test() + else: + results = self.trainer.train() + return results + + def batch_to_device(self, batch: Any, device: torch.device): + model = self.trainer.get_model() + if model is not None: + return model.transfer_batch_to_device(batch, device) + return move_data_to_device(batch, device) + + def training_step_end(self, output): + return output + + def 
test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + + def process_dataloader(self, dataloader): + return dataloader + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + automatic_optimization = self.trainer.train_loop.automatic_optimization + + if not automatic_optimization and self.ddp_plugin is not None: + # Manually prepare for reduce as user calling backwards manually + self.ddp_plugin.on_before_manual_backward(self.trainer.model, closure_loss) + + if self.trainer.precision == 16: + closure_loss = self.trainer.precision_connector.backend.backward( + closure_loss, optimizer, opt_idx, *args, **kwargs + ) + else: + # do backward pass + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def clip_gradients(self, optimizer, clip_val=None): + # use the trainer's clip val if none passed + grad_clip_val = self.trainer.gradient_clip_val + if clip_val is not None: + grad_clip_val = clip_val + grad_clip_val = float(grad_clip_val) + + if grad_clip_val <= 0: + return + self._clip_gradients(optimizer, grad_clip_val) + + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + if self.trainer.amp_backend: + self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + else: + model = self.trainer.get_model() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + + def on_train_epoch_end(self, outputs): + pass + + def on_train_end(self): + pass + + def early_stopping_should_stop(self, pl_module): + return self.trainer.should_stop + + def setup_optimizers(self, model): + if self.trainer.testing: + return + + optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + self.trainer.optimizers = optimizers + self.trainer.lr_schedulers = lr_schedulers + self.trainer.optimizer_frequencies = optimizer_frequencies + + def init_ddp_connection( + self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True + ) -> None: + self.ddp_plugin.init_ddp_connection( + self.trainer, + self.cluster_environment, + global_rank, + world_size, + is_slurm_managing_tasks, + ) + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + raise NotImplementedError() + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) 
+ """ + raise NotImplementedError() + + def optimizer_state(self, optimizer: Optimizer) -> dict: + """ + Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom + plugins. + Return: + Optimizer state dict + """ + if self.ddp_plugin: + return self.ddp_plugin.optimizer_state(optimizer) + return optimizer.state_dict() + + def get_reference_model(self, model) -> LightningModule: + """ + Override to modify returning base :class:`LightningModule` + when accessing variable and functions if the accelerator has wrapped the model. + + Example:: + ref_model = accelerator.get_reference_model(model) + ref_model.training_step(...) + + Args: + model: Accelerator model. + + Returns: Reference :class:`LightningModule`. + + """ + return model + + def __getstate__(self): + return { + 'trainer': self.trainer, + 'nickname': self.nickname, + 'cluster_environment': self.cluster_environment, + 'dist': self.dist, + 'ddp_plugin': self.ddp_plugin + } + + def __setstate__(self, d): + self.trainer = d['trainer'] + self.nickname = d['nickname'] + self.cluster_environment = d['cluster_environment'] + self.dist = d['dist'] + self.ddp_plugin = d['ddp_plugin'] + + def on_save(self, checkpoint): + return checkpoint + + @property + def rpc_enabled(self): + return self.ddp_plugin is not None and isinstance(self.ddp_plugin, RPCPlugin) + + @property + def distributed_sampler_kwargs(self): + raise NotImplementedError + + @property + def require_distributed_sampler(self): + raise NotImplementedError + + @contextmanager + def block_ddp_plugin_sync_behaviour(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + cm = self.ddp_plugin.block_backward_sync(self.trainer.model) if self.ddp_plugin else None + yield cm diff --git a/pytorch_lightning/accelerators/legacy/accelerator_connector.py b/pytorch_lightning/accelerators/legacy/accelerator_connector.py index d9dcc5cbd0a88..8b5e5314b2c54 100644 --- a/pytorch_lightning/accelerators/legacy/accelerator_connector.py +++ b/pytorch_lightning/accelerators/legacy/accelerator_connector.py @@ -16,8 +16,20 @@ import torch from pytorch_lightning import _logger as log -from pytorch_lightning import accelerators -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy import ( + DDP2Accelerator, + DDPCPUHPCAccelerator, + DDPHPCAccelerator, + DDPSpawnAccelerator, + DDPCPUSpawnAccelerator, + DDPAccelerator, + DataParallelAccelerator, + HorovodAccelerator, + TPUAccelerator, + GPUAccelerator, + CPUAccelerator, +) from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.utilities import ( @@ -220,42 +232,42 @@ def select_accelerator(self): # TODO: clean-up this branching as most just select class and uses the very same arguments # choose the appropriate accelerator backend if self.trainer._distrib_type == DistributedType.DDP2: - accelerator_backend = accelerators.DDP2Accelerator( + accelerator_backend = DDP2Accelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_ddp_cpu_slurm: - accelerator_backend = accelerators.DDPCPUHPCAccelerator( + accelerator_backend = DDPCPUHPCAccelerator( 
self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_slurm_ddp: - accelerator_backend = accelerators.DDPHPCAccelerator( + accelerator_backend = DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_ddp_cpu_torch_elastic: - accelerator_backend = accelerators.DDPCPUHPCAccelerator( + accelerator_backend = DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_torchelastic_ddp: - accelerator_backend = accelerators.DDPHPCAccelerator( + accelerator_backend = DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif self.trainer._distrib_type == DistributedType.DDP_SPAWN: - accelerator_backend = accelerators.DDPSpawnAccelerator( + accelerator_backend = DDPSpawnAccelerator( self.trainer, nprocs=self.trainer.num_processes, cluster_environment=cluster_env, @@ -263,7 +275,7 @@ def select_accelerator(self): ) elif use_ddp_cpu_spawn: - accelerator_backend = accelerators.DDPCPUSpawnAccelerator( + accelerator_backend = DDPCPUSpawnAccelerator( self.trainer, nprocs=self.trainer.num_processes, cluster_environment=cluster_env, @@ -271,26 +283,26 @@ def select_accelerator(self): ) elif self.trainer.distributed_backend == "ddp": - accelerator_backend = accelerators.DDPAccelerator( + accelerator_backend = DDPAccelerator( self.trainer, cluster_env, ddp_plugin=self.trainer.plugin_connector.ddp_plugin ) elif self.trainer._distrib_type == DistributedType.DP: - accelerator_backend = accelerators.DataParallelAccelerator(self.trainer, cluster_env) + accelerator_backend = DataParallelAccelerator(self.trainer, cluster_env) elif self.trainer._distrib_type == DistributedType.HOROVOD: - accelerator_backend = accelerators.HorovodAccelerator(self.trainer, cluster_env) + accelerator_backend = HorovodAccelerator(self.trainer, cluster_env) elif self.trainer._device_type == DeviceType.GPU and self.trainer.num_gpus == 1: - accelerator_backend = accelerators.GPUAccelerator(self.trainer, cluster_env) + accelerator_backend = GPUAccelerator(self.trainer, cluster_env) elif self.trainer._device_type == DeviceType.TPU: - accelerator_backend = accelerators.TPUAccelerator(self.trainer, cluster_env) + accelerator_backend = TPUAccelerator(self.trainer, cluster_env) elif self.trainer.distributed_backend is None: - accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) + accelerator_backend = CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},' diff --git a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py index efe14ff6b9b4b..e7d42e2647e93 100644 --- a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import ReduceOp diff --git a/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py new file mode 100644 index 0000000000000..95ea4ab2686da --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py @@ -0,0 
+1,269 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as torch_distrib +from torch.nn.parallel import DistributedDataParallel + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available, \ + ReduceOp + + +class DDP2Accelerator(Accelerator): + + def __init__(self, + trainer, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP2 strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDP2Accelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.task_idx = None + self.dist = LightningDistributed() + self.nickname = 'ddp2' + + def setup(self, model): + self.trainer.model = model + self.task_idx = self.cluster_environment.local_rank() + + def train(self): + model = self.trainer.model + return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def _step(self, args): + args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def training_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def validation_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def test_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def set_world_ranks(self, process_idx): + # Todo: required argument `process_idx` is not used + self.trainer.local_rank = self.trainer.node_rank + self.trainer.global_rank = self.trainer.node_rank + self.trainer.world_size = self.trainer.num_nodes + + def broadcast(self, obj, src=0): + return self.dist.broadcast(obj) + + def init_device(self, process_idx): + self.trainer.root_gpu = process_idx + torch.cuda.set_device(self.trainer.root_gpu) + + def 
model_to_device(self, model): + model.cuda(self.trainer.root_gpu) + + def get_device_ids(self): + device_ids = self.trainer.data_parallel_device_ids + return device_ids + + def ddp_train(self, process_idx, mp_queue, model): + """ + Entry point for ddp + + Args: + process_idx: current process rank + mp_queue: multiprocessing queue + model: pointer to current :class:`LightningModule` + + Returns: + Dict with evaluation results + + """ + # Todo: required argument `mp_queue` is not used + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # Initialize cuda device + self.init_device(process_idx) + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + if isinstance(self.ddp_plugin, RPCPlugin): + if not self.ddp_plugin.is_main_rpc_process: + self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) + self.ddp_plugin.exit_rpc_process() + if self.ddp_plugin.return_after_exit_rpc_process: + return + else: + self.ddp_plugin.on_main_rpc_connection(self.trainer) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + self.ddp_plugin.on_after_setup_optimizers(self.trainer) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + # device ids change depending on the DDP setup + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # clean up memory + torch.cuda.empty_cache() + return results + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> DistributedDataParallel: + model = self.ddp_plugin.configure_ddp(model, device_ids) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. 
+ + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + + def get_reference_model(self, model) -> LightningModule: + return self.ddp_plugin.get_model_from_plugin(model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=self.trainer.num_nodes, + rank=self.trainer.global_rank + ) + if self.ddp_plugin is not None: + distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) + return distributed_sampler_kwargs + + @property + def require_distributed_sampler(self): + return True diff --git a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 729ae2ec2ba94..ff0466662226a 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -26,7 +26,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py new file mode 100644 index 0000000000000..8ec4d18509cab --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py @@ -0,0 +1,48 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +from typing import Optional + +from pytorch_lightning.accelerators.legacy.ddp_hpc_accelerator import DDPHPCAccelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin + + +class DDPCPUHPCAccelerator(DDPHPCAccelerator): + + def __init__(self, + trainer, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP (with CPUs) strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.nickname = 'ddp_cpu' + + def model_to_device(self, model, process_idx): + # Todo: required argument `process_idx` is not used + model.cpu() + + def get_device_ids(self): + device_ids = None + return device_ids + + def init_device(self, process_idx): + pass diff --git a/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py new file mode 100644 index 0000000000000..1559ad671e4d8 --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py @@ -0,0 +1,297 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +import os +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as torch_distrib +import torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ( + all_gather_ddp_if_available, + find_free_network_port, + rank_zero_only, + rank_zero_warn, + sync_ddp_if_available, ReduceOp, +) + + +class DDPCPUSpawnAccelerator(Accelerator): + + def __init__(self, + trainer, + nprocs: int, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.mp_queue = None + self.nprocs = nprocs + self.dist = LightningDistributed() + self.nickname = 'ddp_cpu' + + def setup(self, model): + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() + + self.trainer.model = model + + def train(self): + model = self.trainer.model + + # train in children process + mp.spawn(self.ddp_train, nprocs=self.nprocs, args=(self.mp_queue, model,)) + + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path) + return results + + def ddp_train(self, process_idx, mp_queue, model): + """ + Entry point for ddp + + Args: + process_idx: + mp_queue: multiprocessing queue + model: + """ + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + if isinstance(self.ddp_plugin, RPCPlugin): + if not self.ddp_plugin.is_main_rpc_process: + self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) + self.ddp_plugin.exit_rpc_process() + if self.ddp_plugin.return_after_exit_rpc_process: + return + else: + self.ddp_plugin.on_main_rpc_connection(self.trainer) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not 
torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model, process_idx) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + self.ddp_plugin.on_after_setup_optimizers(self.trainer) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + # DDP spawn already spawned off each process... no need to do anything + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # get original model + model = self.trainer.get_model() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + + # clean up memory + torch.cuda.empty_cache() + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def _step(self, args): + args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj, src=0): + return self.dist.broadcast(obj) + + def early_stopping_should_stop(self, pl_module): + stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) + torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM) + torch_distrib.barrier() + should_stop = stop == self.trainer.world_size + return should_stop + + def set_world_ranks(self, process_idx): + self.trainer.local_rank = process_idx + self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx + self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes + + def model_to_device(self, model, process_idx): + # Todo: required argument `process_idx` is not used + model.cpu() + + def get_device_ids(self): + device_ids = None + return device_ids + + def __recover_child_process_weights(self, model, best_path): + # transfer back the best path to the trainer + if self.trainer.checkpoint_callback: + self.trainer.checkpoint_callback.best_model_path = best_path + + self.trainer.model = model + + def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): + # Todo: required argument `model` is not used + # track the best model path + best_model_path = None + if self.trainer.checkpoint_callback is not None: + best_model_path = self.trainer.checkpoint_callback.best_model_path + + if self.trainer.global_rank == 0 and mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + mp_queue.put(best_model_path) 
+ mp_queue.put(results) + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> DistributedDataParallel: + model = self.ddp_plugin.configure_ddp(model, device_ids) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + + def get_reference_model(self, model) -> LightningModule: + return self.ddp_plugin.get_model_from_plugin(model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=self.trainer.num_nodes * self.trainer.num_processes, + rank=self.trainer.global_rank + ) + if self.ddp_plugin is not None: + distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) + return distributed_sampler_kwargs + + @property + def require_distributed_sampler(self): + return True diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index 58fd60ac18a69..0d45300e0106e 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -21,7 +21,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index 39871a6c6d344..e2e9e3062a909 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -23,7 +23,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import 
LightningModule from pytorch_lightning.distributed import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/dp_accelerator.py b/pytorch_lightning/accelerators/legacy/dp_accelerator.py new file mode 100644 index 0000000000000..13bed9082c24a --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/dp_accelerator.py @@ -0,0 +1,189 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import torch +from torch import optim + +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.distributed import LightningDistributed +from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class DataParallelAccelerator(Accelerator): + + def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): + """ + Runs training using DP via manual start (not HPC cluster) + + Example:: + + # default + trainer = Trainer(accelerator=DataParallelAccelerator()) + + """ + super().__init__(trainer, cluster_environment) + self.model_autocast_original_forward = None + self.dist = LightningDistributed() + self.nickname = 'dp' + + def setup(self, model): + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # put model on correct device + model.cuda(self.trainer.root_gpu) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # init torch data parallel + model = self.__init_torch_data_parallel(model) + + # hack forward to do autocast for the user + self.model_autocast_original_forward = model.forward + + # init half precision + if self.trainer.amp_backend: + model = self.__init_half_precision(model) + + self.trainer.convert_to_lightning_optimizers() + + self.trainer.model = model + + def __init_torch_data_parallel(self, model): + # create list of device ids + device_ids = self.trainer.data_parallel_device_ids + if isinstance(device_ids, int): + device_ids = list(range(device_ids)) + + # set dp device + torch.cuda.set_device(self.trainer.root_gpu) + model = LightningDataParallel(model, device_ids=device_ids) + return model + + def __init_half_precision(self, model): + if self.trainer.amp_backend == AMPType.NATIVE: + self.__init_native_amp(model) + else: + model = self.__init_nvidia_apex(model) + return model + + def __init_native_amp(self, model): + model.forward = torch.cuda.amp.autocast()(model.forward) + + def __init_nvidia_apex(self, model): + # check for this bug (amp + dp + !01 doesn't work) + # https://github.com/NVIDIA/apex/issues/227 + if self.trainer.amp_level == 'O2': + raise MisconfigurationException( + f'Amp level {self.trainer.amp_level} with 
DataParallel is not supported.' + f' See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.' + f' We recommend you switch to ddp if you want to use amp') + else: + model = self.trainer.precision_connector.connect(model) + + return model + + def train(self): + model = self.trainer.model + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + return results + + def teardown(self): + # replace the original fwd function + self.trainer.model.forward = self.model_autocast_original_forward + self.barrier() + + def _step(self, args): + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def training_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def validation_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def test_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def reinit_scheduler_properties(self, optimizers: list, schedulers: list): + """ + Reinitialize optimizer.step properties added by schedulers + """ + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + is_regular_scheduler = optim.lr_scheduler._LRScheduler + is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau + if is_regular_scheduler or is_lr_reduce_on_plateau: + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) + + def get_reference_model(self, model) -> LightningModule: + if isinstance(model, LightningDataParallel): + return model.module + return model + + @property + def require_distributed_sampler(self): + return False diff --git a/pytorch_lightning/accelerators/legacy/gpu_accelerator.py b/pytorch_lightning/accelerators/legacy/gpu_accelerator.py new file mode 100644 index 0000000000000..2314a8c8c7987 --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/gpu_accelerator.py @@ -0,0 +1,109 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Callable, Optional, Union + +import torch + +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ReduceOp + + +class GPUAccelerator(Accelerator): + amp_backend: AMPType + + def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): + """ + Runs training using a single GPU + + Example:: + + # default + trainer = Trainer(accelerator=GPUAccelerator()) + + """ + super().__init__(trainer, cluster_environment) + self.dist = LightningDistributed() + self.nickname = None + + def setup(self, model): + + # call setup + self.trainer.call_setup_hook(model) + + torch.cuda.set_device(self.trainer.root_gpu) + model.cuda(self.trainer.root_gpu) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + self.trainer.model = model + + def train(self): + model = self.trainer.model + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + return results + + def _step(self, model_step: Callable, args): + args[0] = self.to_device(args[0]) + + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = model_step(*args) + else: + output = model_step(*args) + + return output + + def training_step(self, args): + return self._step(self.trainer.model.training_step, args) + + def validation_step(self, args): + return self._step(self.trainer.model.validation_step, args) + + def test_step(self, args): + return self._step(self.trainer.model.test_step, args) + + def to_device(self, batch): + gpu_id = 0 + if isinstance(self.trainer.data_parallel_device_ids, list): + gpu_id = self.trainer.data_parallel_device_ids[0] + + # Don't copy the batch since there is a single gpu that the batch could + # be referenced from and if there are multiple optimizers the batch will + # wind up copying it to the same device repeatedly. 
+ return self.batch_to_device(batch, gpu_id) + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return tensor + + @property + def require_distributed_sampler(self): + return False diff --git a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py index 7d41dd990e7ad..dd9cd911d97d5 100644 --- a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py @@ -17,7 +17,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType, DeviceType from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp diff --git a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py index 158978cbcbba9..4cdf3354556d5 100644 --- a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py @@ -21,7 +21,7 @@ from torch.optim import Optimizer from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( From 9b7326a25c68b89d41105df80e8e24fb9c7decb8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 11:09:38 +0100 Subject: [PATCH 124/157] drop test.py from root --- test.py | 97 --------------------------------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 959436c179c21..0000000000000 --- a/test.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch -import pytorch_lightning as pl - -class RandomDataset(torch.utils.data.Dataset): - def __init__(self, size, length): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - return self.data[index] - - def __len__(self): - return self.len - - -class BoringModel(pl.LightningModule): - - def __init__(self): - """ - Testing PL Module - - Use as follows: - - subclass - - modify the behavior for what you want - - class TestModel(BaseTestModel): - def training_step(...): - # do your own thing - - or: - - model = BaseTestModel() - model.training_epoch_end = None - - """ - super().__init__() - self.layer = torch.nn.Linear(32, 2) - - def forward(self, x): - return self.layer(x) - - def loss(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - - def step(self, x): - x = self(x) - out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) - return out - - def training_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def training_step_end(self, training_step_outputs): - return training_step_outputs - - def training_epoch_end(self, outputs) -> None: - torch.stack([x["loss"] for 
x in outputs]).mean() - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - - # def validation_epoch_end(self, outputs) -> None: - # torch.stack([x['x'] for x in outputs]).mean() - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - - def test_epoch_end(self, outputs) -> None: - torch.stack([x["y"] for x in outputs]).mean() - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] - - def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - @property - def automatic_optimization(self): - return True - -if __name__ == '__main__': - pl.Trainer(gpus=[1,], max_epochs=20, amp_backend='native').fit(BoringModel(), torch.utils.data.DataLoader(RandomDataset(32, 500))) \ No newline at end of file From 96bc05d9d86d6c45a8a0b69525eb3494beaaa794 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Tue, 26 Jan 2021 18:14:01 +0100 Subject: [PATCH 125/157] add tpu accelerator and plugins --- pytorch_lightning/accelerators/tpu.py | 18 +- .../plugins/precision/tpu_bfloat.py | 8 + .../plugins/training_type/__init__.py | 22 ++- .../plugins/training_type/ddp_spawn.py | 2 +- .../plugins/training_type/parallel.py | 4 +- .../plugins/training_type/single_device.py | 6 +- .../plugins/training_type/single_tpu.py | 34 ++++ .../plugins/training_type/tpu_spawn.py | 184 ++++++++++++++++++ pytorch_lightning/trainer/trainer.py | 2 + 9 files changed, 260 insertions(+), 20 deletions(-) create mode 100644 pytorch_lightning/plugins/precision/tpu_bfloat.py create mode 100644 pytorch_lightning/plugins/training_type/single_tpu.py create mode 100644 pytorch_lightning/plugins/training_type/tpu_spawn.py diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index bf922b1c2df8e..1fd6a4f565258 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,13 +1,17 @@ -# TODO: Complete the TPUAccelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.plugins.training_type import SingleTPUPlugin, TPUSpawnPlugin +from pytorch_lightning.plugins.precision import MixedPrecisionPlugin class TPUAccelerator(Accelerator): def setup(self, trainer, model): - raise NotImplementedError + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + raise MisconfigurationException( + "amp + tpu is not supported. " + "Only bfloats are supported on TPU. 
Consider using TPUHalfPrecisionPlugin" + ) - def on_train_start(self): - raise NotImplementedError - - def on_train_end(self): - raise NotImplementedError + if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): + raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.") + return super().setup(trainer, model) \ No newline at end of file diff --git a/pytorch_lightning/plugins/precision/tpu_bfloat.py b/pytorch_lightning/plugins/precision/tpu_bfloat.py new file mode 100644 index 0000000000000..852d2eee6dfc3 --- /dev/null +++ b/pytorch_lightning/plugins/precision/tpu_bfloat.py @@ -0,0 +1,8 @@ +import os +import torch +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin + +class TPUHalfPrecisionPlugin(PrecisionPlugin): + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + os.environ['XLA_USE_BF16'] = str(1) + return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 7109594600a04..7c31c253eb0eb 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -1,10 +1,12 @@ -from pytorch_lightning.plugins .training_type.ddp import DDPPlugin -from pytorch_lightning.plugins .training_type.ddp2 import DDP2Plugin -from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.plugins .training_type.dp import DataParallelPlugin -from pytorch_lightning.plugins .training_type.horovod import HorovodPlugin -from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin -from pytorch_lightning.plugins .training_type.sharded import DDPShardedPlugin -from pytorch_lightning.plugins .training_type.sharded_spawn import DDPSpawnShardedPlugin -from pytorch_lightning.plugins .training_type.single_device import SingleDevicePlugin -from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin +from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 80886d2555c21..95371b48356b6 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -33,7 +33,7 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, + sync_batchnorm: bool = False, **kwargs: Dict[str, Any], 
): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 8bc692b97b3ee..3235f6cef041c 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -90,4 +90,6 @@ def block_backward_sync(self): if isinstance(self.model, LightningDistributedDataParallel): yield self.model.no_sync() else: - yield None \ No newline at end of file + yield None + + \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index c83d9685c428c..de4193ae3d2fd 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -8,6 +8,10 @@ def __init__(self, device): super().__init__() self.device: torch.device = device + @property + def on_tpu(self): + return self.device.type == 'xla' + @property def on_gpu(self): return self.device.type == "cuda" and torch.cuda.is_available() @@ -38,4 +42,4 @@ def barrier(self, *args, **kwargs): pass def broadcast(self, obj: object, src: int = 0) -> object: - return obj \ No newline at end of file + return obj diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py new file mode 100644 index 0000000000000..ace3405463af3 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -0,0 +1,34 @@ +import io +from typing import Optional +import torch +from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn + +if _TPU_AVAILABLE: + import torch_xla + import torch_xla.core.xla_model as xm + + +class SingleTPUPlugin(SingleDevicePlugin): + def __init__(self, device): + super().__init__(device) + + self.tpu_local_core_rank = 0 + self.tpu_global_core_rank = 0 + + def barrier(self, name: Optional[str] = None): + torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") + + def pre_training(self): + if isinstance(self.device, int): + self.device = xm.xla_device(self.device) + + self.tpu_local_core_rank = xm.get_local_ordinal() + self.tpu_global_core_rank = xm.get_ordinal() + + def post_training(self): + model = self.lightning_module + + if self.on_colab_kaggle: + rank_zero_warn("cleaning up... 
please do not interrupt") + self.save_spawn_weights(model) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py new file mode 100644 index 0000000000000..6476c07587e66 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -0,0 +1,184 @@ +import io +import os +from pytorch_lightning.core.lightning import LightningModule +import torch +from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.seed import seed_everything +from typing import Any, Dict, Optional +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn, + +from pytorch_lightning.utilities.apply_func import move_data_to_device + +if _TPU_AVAILABLE: + import torch_xla + import torch_xla.core.xla_model as xm + import torch_xla.distributed.parallel_loader as xla_pl + import torch_xla.distributed.xla_multiprocessing as xmp + +class TPUSpawnPlugin(DDPSpawnPlugin): + def __init__(self, parallel_devices, num_nodes=1, **kwargs: Dict[str, Any]): + + parallel_devices = [xm.xla_device(device) if isinstance(device, int) else device for device in parallel_devices] + super().__init__(parallel_devices, num_nodes=num_nodes, cluster_environment=None, sync_batchnorm=False, **kwargs) + self.tpu_local_core_rank = 0 + self.start_method = None + + @property + def distributed_sampler_kwargs(self): + return dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) + + def process_dataloader(self, dataloader): + device = xm.xla_device(self.trainer.tpu_id) + dataloader = xla_pl.ParallelLoader(dataloader, [device]) + dataloader = dataloader.per_device_loader(device) + return dataloader + + def configure_ddp(self): + pass + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + pass + + def set_world_ranks(self, process_idx): + self.tpu_local_core_rank = xm.get_local_ordinal() + self.tpu_global_core_rank = xm.get_ordinal() + self.global_rank = self.tpu_local_core_rank + self.world_size = self.num_nodes * self.num_processes + + def new_process(self, process_idx, trainer): + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.global_rank + + if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None: + trainer.progress_bar_callback.disable() + + self.model_to_device() + self.barrier() + + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() + + self.__save_end_of_training_weights(self.lightning_module) + self.transfer_distrib_spawn_state_on_fit_end(results) + + def __save_end_of_training_weights(self, model: LightningModule, trainer): + # when training ends on these platforms dump weights to get out of the main process + if self.on_colab_kaggle: + rank_zero_warn("cleaning up... 
please do not interrupt") + self.save_spawn_weights(model) + + def model_to_device(self): + pass + + def barrier(self, name: Optional[str] = None): + torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") + + def on_save(self, checkpoint): + """ + Move XLA tensors to CPU before saving + Recommended on XLA Guide: + https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors + """ + return move_data_to_device(checkpoint, torch.device("cpu")) + + @property + def on_colab_kaggle(self) -> bool: + return bool(os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')) + + def broadcast(self, obj, src=0): + buffer = io.BytesIO() + torch.save(obj, buffer) + data = bytearray(buffer.getbuffer()) + data_tensor = torch.tensor(data).to(xm.xla_device(), dtype=torch.float) + data = xm.all_gather(data_tensor) + buffer = io.BytesIO(data.cpu().byte().numpy()) + obj = torch.load(buffer) + return obj + + def load_spawn_weights(self, original_model): + """ + Load the temp weights saved in the process + To recover the trained model from the ddp process we load the saved weights + """ + + loaded_model = original_model + + if self.is_global_zero: + # load weights saved in ddp + path = os.path.join(original_model.trainer.default_root_dir, "__temp_weight_distributed_end.ckpt") + loaded_model = original_model.__class__.load_from_checkpoint(path) + + # copy loaded weights to old model + original_model.load_state_dict(loaded_model.state_dict()) + + # remove ddp weights + os.remove(path) + + return loaded_model + + def save_spawn_weights(self, model): + """ + Dump a temporary checkpoint after ddp ends to get weights out of the process + """ + if model.trainer.is_global_zero: + path = os.path.join(model.trainer.default_root_dir, "__temp_weight_distributed_end.ckpt") + model.trainer.save_checkpoint(path) + return path + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + stop = xm.mesh_reduce('stop_signal', should_stop, sum) + torch_xla.core.xla_model.rendezvous("pl.EarlyStoppingCallback.stop_distributed_training_check") + should_stop = int(stop.item()) == self.world_size + return should_stop + + def post_training(self): + # TODO: Check if trainer references can be resolved otherwise + model = self.lightning_module + + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # transfer back the best path to the trainer + if self.lightning_module.trainer.checkpoint_callback is not None: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score + + # load last weights + if last_path and not self.lightning_module.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.lightning_module = model + + # when training completes, load the weights back in main process + self.__load_weights_on_main_process() + + def __load_weights_on_main_process(self): + model = self.lightning_module + + # load weights if not interrupted + # TODO: check for trainer reference + if self.on_colab_kaggle and not model.trainer.testing: + self.load_spawn_weights(model) + + self.lightning_module = model + + def start_training(self, trainer): + xmp.spawn(self.new_process, args=(self.lightning_module, trainer, self.mp_queue), + nproc=len(self.parallel_devices), start_method=self.start_method) + + def start_testing(self, 
trainer): + xmp.spawn(self.new_process, args=(self.lightning_module, trainer, self.mp_queue), + nproc=len(self.parallel_devices), start_method=self.start_method) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 96f4eaf430101..fe075c5c95783 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -513,6 +513,7 @@ def fit( # TODO: the old setup is now called "pre_training", where should this hook be called now? self.call_hook("on_before_accelerator_backend_setup", model) self.training_type_plugin.pre_training() + self.precision_plugin.pre_training() self.call_setup_hook(self.lightning_module) @@ -522,6 +523,7 @@ def fit( else: self.training_type_plugin.start_training(self) + self.precision_plugin.post_training() self.training_type_plugin.post_training() self.accelerator_backend.teardown() results = self.training_type_plugin.results From 9e46624370e30f1842d1aa4d381fdc931adaaf5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 30 Jan 2021 15:39:46 +0100 Subject: [PATCH 126/157] fixes --- pytorch_lightning/plugins/training_type/tpu_spawn.py | 2 +- pytorch_lightning/trainer/trainer.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 6476c07587e66..5de336b16870b 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -6,7 +6,7 @@ from pytorch_lightning.utilities.seed import seed_everything from typing import Any, Dict, Optional from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn, +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import move_data_to_device diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 56ae98d3665b7..5344a98fdb73f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -29,7 +29,7 @@ from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.core.step_result import EvalResult, Result +from pytorch_lightning.core.step_result import Result from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler @@ -308,7 +308,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.plugin_connector = PluginConnector(self, plugins) + self.plugin_connector = PluginConnector(self) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -419,7 +419,7 @@ def __init__( # last thing are the plugins which override whatever the trainer used by default # TODO: probably not needed anymore after refactor - self.plugin_connector.on_trainer_init() + self.plugin_connector.on_trainer_init(plugins) # Callback system self.on_init_end() From e174b8dd8b8081cfac242508371d22f719ec9fe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:08:56 +0100 Subject: [PATCH 127/157] fix lightning optimizer merge --- 
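Note on this patch (kept outside the commit message): the hunks below drop the per-plugin and per-accelerator calls to convert_to_lightning_optimizers(), so wrapping optimizers into LightningOptimizer happens in one place on the trainer. A minimal, hypothetical sketch of that wrapping, written as a free function for illustration only (after this patch the real method lives on the trainer and takes no arguments besides self):

    from pytorch_lightning.core.optimizer import LightningOptimizer

    def convert_to_lightning_optimizers(trainer):
        # wrap each plain torch.optim.Optimizer exactly once; optimizers that are
        # already LightningOptimizer instances are passed through untouched
        trainer.optimizers = [
            opt if isinstance(opt, LightningOptimizer) else LightningOptimizer(opt)
            for opt in trainer.optimizers
        ]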
pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/plugins/__init__.py | 5 +++-- pytorch_lightning/plugins/training_type/horovod.py | 3 --- pytorch_lightning/plugins/training_type/sharded.py | 2 -- pytorch_lightning/plugins/training_type/sharded_spawn.py | 3 --- pytorch_lightning/trainer/optimizers.py | 2 +- pytorch_lightning/trainer/training_loop.py | 6 +----- tests/models/test_hooks.py | 1 + tests/models/test_horovod.py | 2 +- 9 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c5c77d4711e6a..8dabd4ed7cf75 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -66,7 +66,6 @@ def setup(self, trainer: "Trainer", model: LightningModule) -> None: self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) - self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self) -> torch.nn.Module: diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index a17d5127edfc6..ffb3b76157e98 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -1,3 +1,4 @@ from pytorch_lightning.plugins.base_plugin import Plugin # noqa: F401 -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 +from pytorch_lightning.plugins.precision import * +from pytorch_lightning.plugins.training_type import * + diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index a8bd0091eef6d..434eb2f09c1db 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -86,9 +86,6 @@ def _filter_named_parameters(model, optimizer): ) for optimizer in optimizers ] - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers - def start_training(self, trainer): with ExitStack() as stack: for optimizer in trainer.optimizers: diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index fb24f8c73315d..16570492a0dc8 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -32,8 +32,6 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) def _wrap_optimizers(self): trainer = self.model.trainer diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index c1020457e3bec..503e78e13618c 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -32,9 +32,6 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) - def _wrap_optimizers(self): trainer = self.model.trainer diff --git a/pytorch_lightning/trainer/optimizers.py 
b/pytorch_lightning/trainer/optimizers.py index 996cfc607f825..20438f427d315 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -81,7 +81,7 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: return optimizers, lr_schedulers, optimizer_frequencies - def convert_to_lightning_optimizers(self, optimizers): + def convert_to_lightning_optimizers(self): def _convert_to_lightning_optimizer(trainer, optimizer): if not isinstance(optimizer, LightningOptimizer): optimizer = LightningOptimizer(optimizer) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 05977c1fc3b86..695741ed3cd22 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -449,11 +449,7 @@ def _process_result(self, training_step_output, split_batch): return training_step_output_for_epoch_end def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure): - with self.trainer.profiler.profile("optimizer_step"): - # optimizer step lightningModule hook - self.trainer.accelerator_backend.optimizer_step( - optimizer, self.trainer.current_epoch, batch_idx, opt_idx, train_step_and_backward_closure - ) + model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) using_native_amp = self.trainer.amp_backend == AMPType.NATIVE diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index ab47dd0d1517f..227716d5e72c4 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +import os from unittest import mock from unittest.mock import MagicMock diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7337ee1200420..429ad108f1fc6 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE From 98660def76119fee5c7826530cf73a066977b7f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:10:30 +0100 Subject: [PATCH 128/157] reset bugreportmodel --- pl_examples/bug_report_model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index df82ea0c835da..4d9a23f48ca5d 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -56,23 +56,24 @@ class BoringModel(LightningModule): def __init__(self): """ Testing PL Module + Use as follows: - subclass - modify the behavior for what you want + class TestModel(BaseTestModel): def training_step(...): # do your own thing + or: + model = BaseTestModel() model.training_epoch_end = None + """ super().__init__() self.layer = torch.nn.Linear(32, 2) - @property - def automatic_optimization(self): - return True - def forward(self, x): return self.layer(x) @@ -81,7 +82,7 @@ def loss(self, batch, prediction): return 
torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) def step(self, x): - x = self(x) + x = self.layer(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out From 4d95b6ce5309c2374c33931e38c0d60a2ae372b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:20:38 +0100 Subject: [PATCH 129/157] unwrapping --- pytorch_lightning/overrides/data_parallel.py | 4 ++-- pytorch_lightning/plugins/training_type/ddp.py | 15 ++++++--------- .../plugins/training_type/ddp_spawn.py | 15 ++++++--------- pytorch_lightning/plugins/training_type/dp.py | 5 +++-- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index d45040562152a..2c38d8e03b3ee 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -27,9 +27,9 @@ def unwrap_lightning_module(wrapped_model): model = wrapped_model - if isinstance(model, (LightningDistributedDataParallel, LightningDataParallel)): + if isinstance(model, (DistributedDataParallel, DataParallel)): model = model.module - if isinstance(model, LightningDistributedModule): + if isinstance(model, _LightningModuleWrapperBase): model = model.module return model diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 1128756780518..ed3cabd1b4fcc 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -20,11 +20,13 @@ import numpy as np import torch import torch.distributed as torch_distrib +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( @@ -77,10 +79,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -184,10 +183,8 @@ def set_world_ranks(self): self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 9745fd5dee9f5..5b585fd1b1c43 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -18,11 +18,13 @@ import torch import 
torch.distributed as torch_distrib import torch.multiprocessing as mp +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -63,10 +65,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -155,10 +154,8 @@ def post_training(self): self.__recover_child_process_weights(best_path, last_path) def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index ce33da87048cc..363a54e53750a 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -14,9 +14,10 @@ from typing import List import torch +from torch.nn import DataParallel from pytorch_lightning.core.step_result import Result -from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.overrides import LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -26,7 +27,7 @@ def __init__(self, parallel_devices: List[torch.device]): super().__init__(parallel_devices=parallel_devices, cluster_environment=None) def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) + self._model = DataParallel(LightningParallelModule(model), self.parallel_devices) def reduce(self, output, *args, **kwargs): if isinstance(output, Result): From b69d0133f815ef90ac16ac073bd2feabc2cd6a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:36:39 +0100 Subject: [PATCH 130/157] step routing forward --- pytorch_lightning/accelerators/accelerator.py | 6 +++--- pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++++ pytorch_lightning/plugins/training_type/ddp_spawn.py | 9 +++++++++ pytorch_lightning/plugins/training_type/dp.py | 10 ++++++++++ pytorch_lightning/plugins/training_type/sharded.py | 9 +++++++++ .../plugins/training_type/sharded_spawn.py | 9 +++++++++ .../plugins/training_type/training_type_plugin.py | 9 +++++++++ 7 files changed, 58 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8dabd4ed7cf75..47b8f03c600d4 100644 --- a/pytorch_lightning/accelerators/accelerator.py 
+++ b/pytorch_lightning/accelerators/accelerator.py @@ -133,7 +133,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.lightning_module.training_step(*args) + return self.training_type_plugin.training_step(*args) def validation_step(self, args): """The actual validation step. @@ -152,7 +152,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.lightning_module.validation_step(*args) + return self.training_type_plugin.validation_step(*args) def test_step(self, args): """The actual test step. @@ -171,7 +171,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.lightning_module.test_step(*args) + return self.training_type_plugin.test_step(*args) def training_step_end(self, output): """A hook to do something at the end of the training step diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ed3cabd1b4fcc..1ee9f8d58089e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -267,3 +267,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5b585fd1b1c43..cb5e4e0cabba5 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -223,3 +223,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 363a54e53750a..f1fcdbe02831d 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -58,3 +58,13 @@ def broadcast(self, obj: object, src: int = 0) -> object: def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 16570492a0dc8..115b1fb0676dc 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -52,3 +52,12 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. 
""" return optimizer.state_dict() + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index 503e78e13618c..8be72f2e52d24 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -54,3 +54,12 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. """ return optimizer.state_dict() + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index d1e7907d5d97f..78c14d153e576 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -110,3 +110,12 @@ def start_training(self, trainer: "Trainer") -> None: def start_testing(self, trainer: "Trainer") -> None: # double dispatch to initiate the test loop self._results = trainer.run_test() + + def training_step(self, *args, **kwargs): + return self.lightning_module.training_step(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.lightning_module.validation_step(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.lightning_module.test_step(*args, **kwargs) From cb6676d4710101e7951f436eac91dcb0a3eb611b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:48:17 +0100 Subject: [PATCH 131/157] model access --- pytorch_lightning/plugins/training_type/dp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index f1fcdbe02831d..fc08080399441 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -44,7 +44,7 @@ def root_device(self): @property def lightning_module(self): - return self._model.module + return getattr(self._model, "module", None) def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel From a33d27fc6809a4c44b74914fd3d3b9992643493e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:49:31 +0100 Subject: [PATCH 132/157] unwrap --- pytorch_lightning/plugins/training_type/dp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index fc08080399441..cc4b3e2584efc 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -18,6 +18,7 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides import LightningParallelModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -44,7 +45,7 @@ def root_device(self): @property def lightning_module(self): - return getattr(self._model, "module", None) + 
return unwrap_lightning_module(self.model) def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel From f7486e2384e8b139dd67ae827d33814bc099948b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:53:37 +0100 Subject: [PATCH 133/157] opt --- pytorch_lightning/plugins/training_type/horovod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 434eb2f09c1db..f45c3dcb93bb6 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -85,6 +85,7 @@ def _filter_named_parameters(model, optimizer): optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) ) for optimizer in optimizers ] + self.lightning_module.trainer.accelerator.optimizers = optimizers def start_training(self, trainer): with ExitStack() as stack: From 3792b72bb714286726c57f094fbd79c9296624a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 00:15:18 +0100 Subject: [PATCH 134/157] integrate distrib_type --- .../accelerators/accelerator_connector.py | 156 +++++++++--------- pytorch_lightning/plugins/training_type/dp.py | 1 - pytorch_lightning/trainer/properties.py | 12 +- pytorch_lightning/trainer/trainer.py | 2 - 4 files changed, 89 insertions(+), 82 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index baf14c4146aed..d0ed8878c1917 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -27,7 +27,8 @@ from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser +from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser, DeviceType, \ + DistributedType, _TPU_AVAILABLE from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -65,13 +66,9 @@ def __init__( amp_level, cluster_environment, ): - # initialization - self.use_dp = False - self.use_ddp = False - self.use_ddp2 = False - self.use_horovod = False - self.use_single_gpu = False + self._device_type = DeviceType.CPU + self._distrib_type = None self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) @@ -137,6 +134,10 @@ def __init__( self.replace_sampler_ddp = replace_sampler_ddp + @property + def on_cpu(self): + return self._device_type == DeviceType.CPU + @property def on_tpu(self): return self.tpu_cores is not None @@ -153,6 +154,22 @@ def on_gpu(self): gpus = self.parallel_device_ids return gpus is not None and len(gpus) > 0 and torch.cuda.is_available() + @property + def use_dp(self): + return self._distrib_type == DistributedType.DP + + @property + def use_ddp(self): + return self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + + @property + def use_ddp2(self): + return self._distrib_type == DistributedType.DDP2 + + @property + def 
use_horovod(self): + return self._distrib_type == DistributedType.HOROVOD + @property def num_gpus(self) -> int: gpus = self.parallel_device_ids @@ -220,8 +237,8 @@ def select_training_type_plugin(self): elif self.use_ddp: use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic - use_ddp_spawn = self.use_ddp and self.distributed_backend == "ddp_spawn" - use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" + use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN + use_ddp_cpu_spawn = self.use_ddp and self.on_cpu use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks use_ddp_sharded = self.distributed_backend == "ddp_sharded" @@ -288,96 +305,85 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): - # No distributed backend + if self.distributed_backend is None: - # horovod multi GPU if self.has_horovodrun(): self._set_horovod_backend() - - # DDP CPU - elif self.num_gpus == 0: - if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - - # Single GPU - elif self.num_gpus == 1: - self.use_single_gpu = True - - # Default: DDP-Spawn + elif self.num_gpus == 0 and (self.num_nodes > 1 or self.num_processes > 1): + self._distrib_type = DistributedType.DDP elif self.num_gpus > 1: rank_zero_warn( - "You requested multiple GPUs but did not specify a backend, e.g." - ' (distributed_backend="dp"|"ddp"|"ddp2").' - ' Setting distributed_backend="ddp_spawn" for you.' + 'You requested multiple GPUs but did not specify a backend, e.g.' + ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.' ) self.distributed_backend = "ddp_spawn" - # DP - if self.distributed_backend == "dp": - # do nothing if num_gpus == 0 - if self.num_gpus == 1: - self.use_single_gpu = True - self.use_dp = True - elif self.num_gpus > 1: - self.use_dp = True - - # DDP, DDP-Spawn - elif self.distributed_backend in ("ddp", "ddp_spawn"): - if self.num_gpus == 0: - # DDP CPU - if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - - # DDP Single GPU - elif self.num_gpus == 1: - self.use_single_gpu = True - self.use_ddp = True - - # DDP Multi GPU - elif self.num_gpus > 1: - self.use_ddp = True - self.num_processes = self.num_gpus - - # DDP2 - elif self.distributed_backend == "ddp2": - # do nothing if num_gpus == 0 - if self.num_gpus >= 1: - self.use_ddp2 = True - - # DDP CPU - elif self.distributed_backend == "ddp_cpu": + # special case with DDP on CPUs + if self.distributed_backend == "ddp_cpu": + self._distrib_type = DistributedType.DDP + self.data_parallel_device_ids = None if self.num_gpus > 0: rank_zero_warn( - "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." + 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.parallel_device_ids = None - self.use_ddp = True + if self.num_processes is None: + # define the max CPU available + self.num_processes = os.cpu_count() + # special case with TPUs + elif self.distributed_backend == 'tpu': + self._device_type = DeviceType.TPU + # set all other requested distrib. 
types and if it was not set in the + elif self.distributed_backend and self._distrib_type is None: + self._distrib_type = DistributedType(self.distributed_backend) + + # unless you request explicitly for CPU and some GPU are available use them + _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend + if (self.num_gpus > 0 and not _on_cpu): + self._device_type = DeviceType.GPU + + _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + # DP and DDP2 cannot run without GPU + if (self.num_gpus == 0 and self._distrib_type in _distrib_types): + rank_zero_warn( + 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' + ) + # todo: in some cases it yields in comparison None and int + if ((self.num_nodes and self.num_nodes > 1) + or (self.num_processes and self.num_processes > 1)): + self._distrib_type = DistributedType.DDP + else: + rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + self._distrib_type = None - # Sharded DDP - elif self.distributed_backend in ("ddp_sharded", "ddp_sharded_spawn"): - self.use_ddp = True + # for DDP overwrite nb processes by requested GPUs + if (self._device_type == DeviceType.GPU + and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): + self.num_processes = self.num_gpus - # HOROVOD - elif self.distributed_backend == "horovod": + # Horovod si an extra case... + if self.distributed_backend == "horovod": self._set_horovod_backend() # throw error to force user ddp or ddp2 choice - if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): + _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + if (self.num_nodes > 1 and self._distrib_type not in _ddp): raise MisconfigurationException( - "DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. " - "To silence this warning set distributed_backend=ddp or distributed_backend=ddp2" + 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' + 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' ) - rank_zero_info(f"GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}") + rank_zero_info( + f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}' + ) num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f"TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores") + rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and not self.on_gpu: - rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") + rank_zero_warn('GPU available but not used.
Set the --gpus flag when calling the script.') def _set_horovod_backend(self): self.check_horovod() - self.use_horovod = True + self._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info hvd.init() diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index cc4b3e2584efc..d16a25c52e6bc 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -68,4 +68,3 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) - diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index db58e5a4815a0..81777530723fe 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -133,16 +133,20 @@ def use_ddp2(self): def use_horovod(self): return self.accelerator_connector.use_horovod - @property - def use_single_gpu(self): - return self.accelerator_connector.use_single_gpu - @property def use_tpu(self): # TODO update this, what is the difference between use_tpu and on_tpu? return False # return self.accelerator_connector.use_tpu + @property + def _distrib_type(self): + return self.accelerator_connector._distrib_type + + @property + def _device_type(self): + return self.accelerator_connector._device_type + @property def num_nodes(self): return self.accelerator_connector.num_nodes diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5344a98fdb73f..c404adadd8117 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -296,8 +296,6 @@ def __init__( reload when reaching the minimum length of datasets. """ super().__init__() - self._device_type = DeviceType.CPU - self._distrib_type = None self._running_stage = None self._predicting = False From ef85b812b3a3352390e0366ce6f9a9c11c969c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 00:54:38 +0100 Subject: [PATCH 135/157] sync changes --- .../accelerators/accelerator_connector.py | 149 +++++++++++------- 1 file changed, 91 insertions(+), 58 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0ed8878c1917..94f98e1f65521 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,21 +20,43 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ - PrecisionPlugin, ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment +from pytorch_lightning.plugins import ( + ApexMixedPrecisionPlugin, + DataParallelPlugin, + DDP2Plugin, + DDPPlugin, + DDPShardedPlugin, + DDPSpawnPlugin, + DDPSpawnShardedPlugin, + HorovodPlugin, + NativeMixedPrecisionPlugin, + PrecisionPlugin, + 
ShardedNativeMixedPrecisionPlugin, + SingleDevicePlugin, + SingleTPUPlugin, + TPUHalfPrecisionPlugin, + TPUSpawnPlugin, +) from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser, DeviceType, \ - DistributedType, _TPU_AVAILABLE -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + _TPU_AVAILABLE, + AMPType, + device_parser, + DeviceType, + DistributedType, + rank_zero_only, +) +from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException try: import torch_xla + import torch_xla.core.xla_model as xm except ImportError: XLA_AVAILABLE = False else: @@ -49,22 +71,23 @@ class BackendConnector(object): + def __init__( - self, - num_processes, - tpu_cores, - distributed_backend, - auto_select_gpus, - gpus, - num_nodes, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - precision, - amp_type, - amp_level, - cluster_environment, + self, + num_processes, + tpu_cores, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + precision, + amp_type, + amp_level, + cluster_environment, ): # initialization self._device_type = DeviceType.CPU @@ -182,14 +205,14 @@ def parallel_devices(self): if self.on_gpu: devices = [torch.device("cuda", i) for i in self.parallel_device_ids] elif self.on_tpu: - raise NotImplementedError + devices = [xm.xla_device(i) for i in self.parallel_device_ids] else: devices = [torch.device("cpu")] * self.num_processes return devices @property def is_using_torchelastic(self): - te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + te_flags_passed = "WORLD_SIZE" in os.environ and ("GROUP_RANK" in os.environ or "NODE_RANK" in os.environ) return te_flags_passed def select_precision_plugin(self): @@ -198,42 +221,46 @@ def select_precision_plugin(self): return PrecisionPlugin() elif self.precision == 16: - if self.amp_type == 'native': + if self.on_tpu: + return TPUHalfPrecisionPlugin() + + if self.amp_type == "native": if not _NATIVE_AMP_AVAILABLE: - rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' - ' Consider upgrading with `pip install torch>=1.6`.' - ' We will attempt to use NVIDIA Apex for this session.') - self.amp_type = 'apex' + rank_zero_warn( + "You have asked for native AMP but your PyTorch version does not support it." + " Consider upgrading with `pip install torch>=1.6`." + " We will attempt to use NVIDIA Apex for this session." + ) + self.amp_type = "apex" else: - log.info('Using native 16bit precision.') - if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + log.info("Using native 16bit precision.") + if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": return ShardedNativeMixedPrecisionPlugin() self.amp_type = AMPType.NATIVE return NativeMixedPrecisionPlugin() - if self.amp_type == 'apex': + if self.amp_type == "apex": if not _APEX_AVAILABLE: - rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' 
- ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + rank_zero_warn( + "You have asked for Apex AMP but you have not installed it yet." + " Install apex first using this guide: https://github.com/NVIDIA/apex#linux" + ) else: - if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": raise MisconfigurationException( - 'Sharded Plugin is not supported with Apex AMP, ' - 'please using native AMP for 16-bit precision.' + "Sharded Plugin is not supported with Apex AMP, " + "please using native AMP for 16-bit precision." ) - log.info('Using APEX 16bit precision.') + log.info("Using APEX 16bit precision.") self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) else: - raise NotImplementedError('We only support precisions 32 and 16!') + raise NotImplementedError("We only support precisions 32 and 16!") def select_training_type_plugin(self): cluster_environment = self.select_cluster_environment() if self.use_ddp2: - plugin = DDP2Plugin( - parallel_devices=self.parallel_devices, - cluster_environment=cluster_environment - ) + plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=cluster_environment) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic @@ -244,9 +271,12 @@ def select_training_type_plugin(self): use_ddp_sharded = self.distributed_backend == "ddp_sharded" use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" + if self.on_tpu: + ddp_plugin_cls = TPUSpawnPlugin + # ddp script mode uses the same flags as TE # TODO: decouple from TE - if os.environ.get('PL_IN_DDP_SUBPROCESS', False): + if os.environ.get("PL_IN_DDP_SUBPROCESS", False): use_torchelastic_ddp = False if use_ddp_sharded: @@ -270,6 +300,8 @@ def select_training_type_plugin(self): plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_horovod: plugin = HorovodPlugin(parallel_devices=self.parallel_devices) + elif self.on_tpu: + plugin = SingleTPUPlugin(self.tpu_id) else: plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -281,6 +313,8 @@ def select_accelerator(self): if self.on_gpu: acc_cls = GPUAccelerator + elif self.on_tpu: + acc_cls = TPUAccelerator else: acc_cls = CPUAccelerator @@ -348,16 +382,17 @@ def set_distributed_mode(self): 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) # todo: in some cases it yield in comarison None and int - if ((self.num_nodes and self.num_nodes > 1) - or (self.num_processes and self.num_processes > 1)): + if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): self._distrib_type = DistributedType.DDP else: rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') self._distrib_type = None # for DDP overwrite nb processes by requested GPUs - if (self._device_type == DeviceType.GPU - and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): + if ( + self._device_type == DeviceType.GPU + and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + ): self.num_processes = self.num_gpus # Horovod si an extra case... 
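Note between hunks (not part of the patch): these accelerator_connector.py changes keep replacing the old boolean flags with a single DistributedType / DeviceType pair derived once from the accelerator string. A small self-contained sketch of that pattern, using stand-in enums rather than the actual Lightning classes:

    from enum import Enum

    class DistributedType(str, Enum):
        DP = "dp"
        DDP = "ddp"
        DDP_SPAWN = "ddp_spawn"
        DDP2 = "ddp2"
        HOROVOD = "horovod"

    class DeviceType(str, Enum):
        CPU = "cpu"
        GPU = "gpu"
        TPU = "tpu"

    def resolve(accelerator, num_gpus):
        # parse the requested backend once; later checks become enum comparisons,
        # e.g. `distrib in (DistributedType.DDP, DistributedType.DDP_SPAWN)`
        distrib = DistributedType(accelerator) if accelerator else None
        device = DeviceType.GPU if num_gpus > 0 else DeviceType.CPU
        return distrib, device

    # resolve("ddp_spawn", 2) -> (<DistributedType.DDP_SPAWN: 'ddp_spawn'>, <DeviceType.GPU: 'gpu'>)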
@@ -372,14 +407,12 @@ def set_distributed_mode(self): 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' ) - rank_zero_info( - f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}' - ) + rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') num_cores = self.tpu_cores if self.tpu_cores is not None else 0 rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: - rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') + rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") def _set_horovod_backend(self): self.check_horovod() @@ -421,7 +454,7 @@ def configure_slurm_ddp(self): num_requested_gpus = self.num_gpus * self.num_nodes num_slurm_tasks = 0 try: - num_slurm_tasks = int(os.environ['SLURM_NTASKS']) + num_slurm_tasks = int(os.environ["SLURM_NTASKS"]) self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus # enable slurm cpu @@ -429,8 +462,8 @@ def configure_slurm_ddp(self): self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes # in interactive mode we don't manage tasks - job_name = os.environ['SLURM_JOB_NAME'] - if job_name == 'bash': + job_name = os.environ["SLURM_JOB_NAME"] + if job_name == "bash": self.is_slurm_managing_tasks = False except Exception: @@ -439,7 +472,7 @@ def configure_slurm_ddp(self): # used for tests only, set this flag to simulate slurm managing a task try: - should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS']) + should_fake = int(os.environ["FAKE_SLURM_MANAGING_TASKS"]) if should_fake: self.is_slurm_managing_tasks = True except Exception: @@ -447,4 +480,4 @@ def configure_slurm_ddp(self): # notify user the that slurm is managing tasks if self.is_slurm_managing_tasks: - rank_zero_info('Multi-processing is handled by Slurm.') + rank_zero_info("Multi-processing is handled by Slurm.") From 9d9a9409ae836b9f9413914d8a1072cebc7d9025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 01:35:53 +0100 Subject: [PATCH 136/157] sync --- pytorch_lightning/accelerators/accelerator_connector.py | 6 ++---- pytorch_lightning/trainer/connectors/env_vars_connector.py | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 94f98e1f65521..589843064bd3b 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -44,7 +44,6 @@ from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, - _TPU_AVAILABLE, AMPType, device_parser, DeviceType, @@ -55,7 +54,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException try: - import torch_xla import torch_xla.core.xla_model as xm except ImportError: XLA_AVAILABLE = False @@ -395,7 +393,7 @@ def set_distributed_mode(self): ): self.num_processes = self.num_gpus - # Horovod si an extra case... + # Horovod is an extra case... 
if self.distributed_backend == "horovod": self._set_horovod_backend() @@ -409,7 +407,7 @@ def set_distributed_mode(self): rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') + rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index 6b907d288c5ca..e4d5670b5fe78 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -28,7 +28,6 @@ def overwrite_by_env_vars(fn: Callable) -> Callable: def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ - if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] From a190a565619f3beb72735e0ce19db78beb138409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 14:48:44 +0100 Subject: [PATCH 137/157] fixes --- pytorch_lightning/plugins/__init__.py | 1 + tests/models/test_amp.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 9f748996a707d..1a8b5090a346b 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -4,6 +4,7 @@ from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.tpu_bfloat import TPUHalfPrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp import DDPPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401 diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index f9c502bf3ce7e..94bfd6808ed79 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -20,8 +20,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.cluster_environments import SLURMEnvironment -from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException From 73bb60787383907ee5aa87985debf74ff70051e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 14:51:52 +0100 Subject: [PATCH 138/157] add forgotten generators --- pytorch_lightning/plugins/base_plugin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py index c325518e4c8ff..b316a8663f9ff 100644 --- 
a/pytorch_lightning/plugins/base_plugin.py +++ b/pytorch_lightning/plugins/base_plugin.py @@ -43,11 +43,14 @@ def post_training(self) -> None: @contextlib.contextmanager def train_step_context(self) -> Generator: """A contextmanager for the trainstep""" + yield @contextlib.contextmanager def val_step_context(self) -> Generator: """A contextmanager for the validation step""" + yield @contextlib.contextmanager def test_step_context(self) -> Generator: """A contextmanager for the teststep""" + yield From ae71997dac15d560a45c07d1bf891f9409c9d777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:27:04 +0100 Subject: [PATCH 139/157] add missing logic --- pytorch_lightning/accelerators/accelerator.py | 7 +++---- pytorch_lightning/overrides/data_parallel.py | 9 ++++++++ pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/plugins/base_plugin.py | 3 +++ .../plugins/training_type/__init__.py | 11 ++++++++++ .../plugins/training_type/ddp.py | 21 ++++++++++++------- .../plugins/training_type/ddp_spawn.py | 21 ++++++++++++------- .../plugins/training_type/horovod.py | 4 +--- .../training_type/training_type_plugin.py | 9 ++++++++ 9 files changed, 63 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 576c8279376ea..e26dc8b476ab2 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -73,7 +73,6 @@ def setup(self, trainer: "Trainer", model: LightningModule) -> None: self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) - self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self) -> torch.nn.Module: @@ -141,7 +140,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.lightning_module.training_step(*args) + return self.training_type_plugin.training_step(*args) def validation_step(self, args): """The actual validation step. @@ -160,7 +159,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.lightning_module.validation_step(*args) + return self.training_type_plugin.validation_step(*args) def test_step(self, args): """The actual test step. 
@@ -179,7 +178,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.lightning_module.test_step(*args) + return self.training_type_plugin.test_step(*args) def training_step_end(self, output): """A hook to do something at the end of the training step diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index b027502f99e8a..28840cd51faf6 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,6 +25,15 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection +def unwrap_lightning_module(wrapped_model) -> LightningModule: + model = wrapped_model + if isinstance(model, (DistributedDataParallel, DataParallel)): + model = model.module + if isinstance(model, _LightningModuleWrapperBase): + model = model.module + return model + + class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 9f748996a707d..0990b547907e7 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 diff --git a/pytorch_lightning/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py index c325518e4c8ff..b316a8663f9ff 100644 --- a/pytorch_lightning/plugins/base_plugin.py +++ b/pytorch_lightning/plugins/base_plugin.py @@ -43,11 +43,14 @@ def post_training(self) -> None: @contextlib.contextmanager def train_step_context(self) -> Generator: """A contextmanager for the trainstep""" + yield @contextlib.contextmanager def val_step_context(self) -> Generator: """A contextmanager for the validation step""" + yield @contextlib.contextmanager def test_step_context(self) -> Generator: """A contextmanager for the teststep""" + yield diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 329f6347b17c3..21dec5bc5ccda 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -1 +1,12 @@ +from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins.training_type.single_device import 
SingleDevicePlugin +from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin +from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index f1027efb418ba..c133e0e68bc93 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -20,6 +20,7 @@ import numpy as np import torch import torch.distributed as torch_distrib +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed @@ -77,10 +78,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -184,10 +182,8 @@ def set_world_ranks(self): self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) @@ -270,3 +266,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 99fd2d5ea3c61..fd4fc9219196a 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -18,6 +18,7 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed @@ -63,10 +64,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -155,10 +153,8 @@ def post_training(self): self.__recover_child_process_weights(best_path, last_path) def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), 
**self._ddp_kwargs, ) @@ -226,3 +222,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index a8bd0091eef6d..f45c3dcb93bb6 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -85,9 +85,7 @@ def _filter_named_parameters(model, optimizer): optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) ) for optimizer in optimizers ] - - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers + self.lightning_module.trainer.accelerator.optimizers = optimizers def start_training(self, trainer): with ExitStack() as stack: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 5dbbf23881373..89f2329512e5e 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -114,3 +114,12 @@ def start_training(self, trainer: 'Trainer') -> None: def start_testing(self, trainer: 'Trainer') -> None: # double dispatch to initiate the test loop self._results = trainer.run_test() + + def training_step(self, *args, **kwargs): + return self.lightning_module.training_step(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.lightning_module.validation_step(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.lightning_module.test_step(*args, **kwargs) From 0e686c315048c282d7bdb1d579506515e1921da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:43:55 +0100 Subject: [PATCH 140/157] update --- pytorch_lightning/plugins/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 1a8b5090a346b..6a70ee62c9722 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -13,6 +13,8 @@ from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 __all__ = [ From d6a43eab8685cc4fd3583ddddb03d81d4b50494a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:55:55 +0100 Subject: [PATCH 141/157] import --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- pytorch_lightning/plugins/training_type/ddp_spawn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c133e0e68bc93..84b70662c1f48 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -24,7 +24,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index fd4fc9219196a..45640524e1d99 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From ceb8f75dc05a6d98206cc0e4ac84d5afee2f5669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:59:57 +0100 Subject: [PATCH 142/157] missed imports --- pytorch_lightning/plugins/training_type/ddp.py | 3 ++- pytorch_lightning/plugins/training_type/ddp_spawn.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c133e0e68bc93..ffbebe8178697 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -24,7 +24,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index fd4fc9219196a..425becfbb8d9d 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -22,7 +22,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from 
pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From fbb7c20e86df44574c1316169dfcf98c20933e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:02:05 +0100 Subject: [PATCH 143/157] import fixes --- pytorch_lightning/plugins/training_type/rpc.py | 2 +- pytorch_lightning/plugins/training_type/rpc_sequential.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 3bd0ba913d0b1..5b48f0e9d02e9 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -17,7 +17,7 @@ import torch -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index 9bf2f6dbc77c3..4ab6cc22e3760 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -21,7 +21,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import LightningModule -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin from pytorch_lightning.trainer.states import RunningStage From b61099905309d30cc7b017b5a64fea0ea8fa7982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:05:00 +0100 Subject: [PATCH 144/157] isort --- pytorch_lightning/plugins/training_type/rpc.py | 2 +- pytorch_lightning/plugins/training_type/rpc_sequential.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 5b48f0e9d02e9..4aff83189b6bc 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -17,8 +17,8 @@ import torch -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index 4ab6cc22e3760..baff4289c75a1 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -21,8 +21,8 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import LightningModule -from 
pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only From 9b799247dd372488458952dfa03dc25f72ac8ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:07:46 +0100 Subject: [PATCH 145/157] mv f --- pytorch_lightning/overrides/base.py | 11 +++++++++++ pytorch_lightning/overrides/data_parallel.py | 9 --------- pytorch_lightning/plugins/training_type/ddp.py | 3 ++- pytorch_lightning/plugins/training_type/ddp_spawn.py | 3 ++- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index b2ad5b7d710fe..3dd20f6d4303b 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -14,6 +14,8 @@ from typing import Any import torch +from torch.nn import DataParallel +from torch.nn.parallel import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.states import RunningStage @@ -61,3 +63,12 @@ def warn_if_output_is_none(output: Any, method_name: str) -> None: """ Warns user about which method returned None. """ if output is None: warning_cache.warn(f'Your {method_name} returned None. Did you forget to return an output?') + + +def unwrap_lightning_module(wrapped_model) -> LightningModule: + model = wrapped_model + if isinstance(model, (DistributedDataParallel, DataParallel)): + model = model.module + if isinstance(model, _LightningModuleWrapperBase): + model = model.module + return model diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 28840cd51faf6..b027502f99e8a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,15 +25,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection -def unwrap_lightning_module(wrapped_model) -> LightningModule: - model = wrapped_model - if isinstance(model, (DistributedDataParallel, DataParallel)): - model = model.module - if isinstance(model, _LightningModuleWrapperBase): - model = model.module - return model - - class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ffbebe8178697..28872f882ab8c 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -25,7 +25,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module +from pytorch_lightning.overrides.base import unwrap_lightning_module +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin 
from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 425becfbb8d9d..5e6b251e0c373 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -23,7 +23,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module +from pytorch_lightning.overrides.base import unwrap_lightning_module +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From 9afe54de9fe98bfa34dc725ed36685ddd18c4acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:27:29 +0100 Subject: [PATCH 146/157] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc381b3983753..997ec482855ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -112,6 +112,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Added parallel plugins for DP, DDP, DDPSpawn, DDP2 and Horovod ([#5714](https://github.com/PyTorchLightning/pytorch-lightning/pull/5714)) * Added new Accelerators for CPU, GPU and TPU ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) * Added Plugins for TPU training ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) + * Added RPC and Sharded plugins ([#5732](https://github.com/PyTorchLightning/pytorch-lightning/pull/5732)) + * Added missing `LightningModule`-wrapper logic to new plugins and accelerator ([#5734](https://github.com/PyTorchLightning/pytorch-lightning/pull/5734)) ### Deprecated From ca8cb6822cff10be8376069d4972bf95bfae5916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:28:22 +0100 Subject: [PATCH 147/157] format --- tests/core/test_lightning_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 17d25b6c9b75a..4d36027709900 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest.mock import patch, Mock +from unittest.mock import Mock, patch import pytest from torch.optim import Adam, SGD From 06337451bc976565976e12ad8c8a8b0b86506bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 17:06:19 +0100 Subject: [PATCH 148/157] move helper to parallel plugin --- pytorch_lightning/plugins/training_type/ddp.py | 6 ------ pytorch_lightning/plugins/training_type/ddp_spawn.py | 6 ------ pytorch_lightning/plugins/training_type/dp.py | 9 +++------ pytorch_lightning/plugins/training_type/parallel.py | 5 +++++ 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 28872f882ab8c..bb906a2268d62 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -25,8 +25,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.base import unwrap_lightning_module -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE @@ -78,10 +76,6 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] - @property - def lightning_module(self): - return unwrap_lightning_module(self._model) - @property def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5e6b251e0c373..6f251eb36985a 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -23,8 +23,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.base import unwrap_lightning_module -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save @@ -64,10 +62,6 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] - @property - def lightning_module(self): - return unwrap_lightning_module(self._model) - @property def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index ce33da87048cc..4f35b8b37ea08 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -14,9 +14,10 @@ from typing import List import torch +from torch.nn import DataParallel from pytorch_lightning.core.step_result import Result -from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.overrides.data_parallel import 
LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -26,7 +27,7 @@ def __init__(self, parallel_devices: List[torch.device]): super().__init__(parallel_devices=parallel_devices, cluster_environment=None) def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) + self._model = DataParallel(LightningParallelModule(model), self.parallel_devices) def reduce(self, output, *args, **kwargs): if isinstance(output, Result): @@ -41,10 +42,6 @@ def reduce(self, output, *args, **kwargs): def root_device(self): return self.parallel_devices[0] - @property - def lightning_module(self): - return self._model.module - def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel pass diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index af4c2e254be56..91d44fbdaa5d1 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -18,6 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin @@ -46,6 +47,10 @@ def root_device(self): def on_gpu(self): return self.root_device.type == "cuda" and torch.cuda.is_available() + @property + def lightning_module(self): + return unwrap_lightning_module(self._model) + @abstractmethod def setup(self, model): raise NotImplementedError From a622e0b6ce3fcfe6f64c282da0b850874b9bc93c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 17:07:41 +0100 Subject: [PATCH 149/157] d --- pytorch_lightning/plugins/training_type/dp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 4f35b8b37ea08..2bf4bbc0b4a96 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -54,3 +54,12 @@ def broadcast(self, obj: object, src: int = 0) -> object: def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) From f2758034d3ee9b07f7219025772b98a96dd56b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:01:31 +0100 Subject: [PATCH 150/157] add world size --- pytorch_lightning/plugins/training_type/horovod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index f45c3dcb93bb6..335f65b3e3fbb 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -45,6 +45,7 @@ def setup(self, model): self.global_rank = hvd.rank() self.local_rank = hvd.local_rank() + self.world_size = hvd.size() rank_zero_only.rank = self.global_rank self.model_to_device() From 4ae008bf7e0b0e6e4ea93f6c3a8cf6ffffcb478e Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:20:40 +0100 Subject: [PATCH 151/157] clean up --- pytorch_lightning/trainer/trainer.py | 36 ---------------------------- 1 file changed, 36 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c404adadd8117..9565db4ddf2bc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -422,42 +422,6 @@ def __init__( # Callback system self.on_init_end() - @property - def optimizers(self): - return self.accelerator_backend.optimizers - - @optimizers.setter - def optimizers(self, new_optims): - self.accelerator_backend.optimizers = new_optims - - @property - def lr_schedulers(self): - return self.accelerator_backend.lr_schedulers - - @lr_schedulers.setter - def lr_schedulers(self, new_schedulers): - self.accelerator_backend.lr_schedulers = new_schedulers - - @property - def optimizer_frequencies(self): - return self.accelerator_backend.optimizer_frequencies - - @optimizer_frequencies.setter - def optimizer_frequencies(self, new_freqs): - self.accelerator_backend.optimizer_frequencies = new_freqs - - @property - def amp_backend(self): - return self.accelerator_backend.amp_backend - - @property - def precision(self): - return self.accelerator_backend.precision - - @property - def scaler(self): - return self.accelerator_backend.scaler - def fit( self, model: LightningModule, From d4c63086472622a5b9a00cb47dacf3c61814a543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:26:18 +0100 Subject: [PATCH 152/157] duplicate --- pytorch_lightning/overrides/data_parallel.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 1d29cbf8081f6..b027502f99e8a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,15 +25,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection -def unwrap_lightning_module(wrapped_model): - model = wrapped_model - if isinstance(model, (DistributedDataParallel, DataParallel)): - model = model.module - if isinstance(model, _LightningModuleWrapperBase): - model = model.module - return model - - class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): From 994916490e05c48ec595e309e2f336ddee9e834a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:40:18 +0100 Subject: [PATCH 153/157] activate ddp_sharded and tpu --- .../accelerators/accelerator_connector.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b6c60bb1a7eee..6e3cc9d57b704 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -34,7 +34,7 @@ SingleDevicePlugin, SingleTPUPlugin, TPUHalfPrecisionPlugin, - TPUSpawnPlugin, + TPUSpawnPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin, ) from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus @@ -256,23 +256,21 @@ def select_training_type_plugin(self): use_ddp_cpu_spawn = self.use_ddp and self.on_cpu use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic 
use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks - # use_ddp_sharded = self.distributed_backend == "ddp_sharded" - # use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" + use_ddp_sharded = self.distributed_backend == "ddp_sharded" + use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" - if self.on_tpu: - ddp_plugin_cls = TPUSpawnPlugin - - # ddp script mode uses the same flags as TE # TODO: decouple from TE + # ddp script mode uses the same flags as TE if os.environ.get("PL_IN_DDP_SUBPROCESS", False): use_torchelastic_ddp = False - # fixme - # if use_ddp_sharded: - # ddp_plugin_cls = DDPShardedPlugin - # elif use_ddp_sharded_spawn: - # ddp_plugin_cls = DDPSpawnShardedPlugin - if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + if self.on_tpu: + ddp_plugin_cls = TPUSpawnPlugin + elif use_ddp_sharded: + ddp_plugin_cls = DDPShardedPlugin + elif use_ddp_sharded_spawn: + ddp_plugin_cls = DDPSpawnShardedPlugin + elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: ddp_plugin_cls = DDPSpawnPlugin From 6d47357b2f09a2e90184dc5a1ed1b7e0ad85ca9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:53:35 +0100 Subject: [PATCH 154/157] set nvidia flags --- .../accelerators/accelerator_connector.py | 3 --- pytorch_lightning/accelerators/gpu.py | 15 ++++++++++++++- .../plugins/training_type/training_type_plugin.py | 12 ------------ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6e3cc9d57b704..43cea74f36ffa 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -123,9 +123,6 @@ def __init__( self.interactive_ddp_procs = [] self.global_rank = 0 - # NVIDIA setup - # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) - # benchmarking # TODO: should this be moved to GPU accelerator? 
torch.backends.cudnn.benchmark = self.benchmark diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 833d5e1cb2a9a..f01cecac1615a 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -1,17 +1,22 @@ +import logging +import os + import torch from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException +log = logging.getLogger(__name__) + class GPUAccelerator(Accelerator): def setup(self, trainer, model): if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + self.set_nvidia_flags() torch.cuda.set_device(self.root_device) model.to(self.root_device) - return super().setup(trainer, model) def on_train_start(self): @@ -25,3 +30,11 @@ def on_train_end(self): # clean up memory with torch.cuda.device(self.root_device): torch.cuda.empty_cache() + + @staticmethod + def set_nvidia_flags(): + # set the correct cuda visible devices (using pci order) + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) + log.info(f"LOCAL_RANK: {os.getenv('LOCAL_RANK', 0)} - CUDA_VISIBLE_DEVICES: [{devices}]") diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 89f2329512e5e..bda5d161da33b 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -64,18 +64,6 @@ def barrier(self, name: Optional[str] = None) -> None: def broadcast(self, obj: object, src: int = 0) -> object: """Broadcasts an object to all processes""" - # TODO method this is currently unused. 
Check after complete refactors are pushed - def set_nvidia_flags(self, is_slurm_managing_tasks: bool, device_ids: Optional[Sequence]) -> None: - if device_ids is None: - return - - # set the correct cuda visible devices (using pci order) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) - devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) - if self.lightning_module is not None: - log.info(f"LOCAL_RANK: {self.lightning_module.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: """Reduce the early stopping decision across all possibly spawned processes""" return should_stop From a6864ec795542965e9efa4415047090f5355d243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:58:26 +0100 Subject: [PATCH 155/157] remove unused colab var --- pytorch_lightning/accelerators/accelerator_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 43cea74f36ffa..b86d78e7ee37f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -135,9 +135,6 @@ def __init__( # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) - # TODO: move this to TPU accelerator/plugin - self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") - self.replace_sampler_ddp = replace_sampler_ddp @property From b4b9724c32bfd4f5d6e46653a3153912467b1f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 06:06:31 +0100 Subject: [PATCH 156/157] use_tpu <-> on_tpu attrs --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/trainer/properties.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b86d78e7ee37f..01283f2aab14a 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -116,7 +116,6 @@ def __init__( # override dist backend when using tpus if self.on_tpu: self.distributed_backend = "tpu" - self.use_tpu = True # init flags for SLURM+DDP to work self.world_size = 1 diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 81777530723fe..39dcbc6c7c3e0 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -135,9 +135,7 @@ def use_horovod(self): @property def use_tpu(self): - # TODO update this, what is the difference between use_tpu and on_tpu? 
- return False - # return self.accelerator_connector.use_tpu + return self.accelerator_connector.on_tpu @property def _distrib_type(self): From 81001e3a3b1e43130a223193c1aa82d552eac02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 09:42:08 +0100 Subject: [PATCH 157/157] make some ddp_cpu and clusterplugin tests pass --- .../accelerators/accelerator_connector.py | 28 ++++++++++--------- .../plugins/legacy/plugin_connector.py | 27 +++++++++--------- pytorch_lightning/trainer/trainer.py | 4 +-- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 01283f2aab14a..1fa95ef4c13b5 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -319,6 +319,8 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): + if isinstance(self.distributed_backend, Accelerator): + return if self.distributed_backend is None: if self.has_horovodrun(): @@ -346,27 +348,27 @@ def set_distributed_mode(self): # special case with TPUs elif self.distributed_backend == 'tpu': self._device_type = DeviceType.TPU - # set all other requested distrib. types adn if it was not set in the + # set all other requested distrib. types and if it was not set in the elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) # unless you request explicitly for CPU and some GPU are available use them _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend - if (self.num_gpus > 0 and not _on_cpu): + if self.num_gpus > 0 and not _on_cpu: self._device_type = DeviceType.GPU - _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + # _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU - if (self.num_gpus == 0 and self._distrib_type in _distrib_types): - rank_zero_warn( - 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' - ) - # todo: in some cases it yield in comarison None and int - if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): - self._distrib_type = DistributedType.DDP - else: - rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') - self._distrib_type = None + # if (self.num_gpus == 0 and self._distrib_type in _distrib_types): + # rank_zero_warn( + # 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
+ # ) + # # todo: in some cases it yield in comarison None and int + # if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): + # self._distrib_type = DistributedType.DDP + # else: + # rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + # self._distrib_type = None # for DDP overwrite nb processes by requested GPUs if ( diff --git a/pytorch_lightning/plugins/legacy/plugin_connector.py b/pytorch_lightning/plugins/legacy/plugin_connector.py index 22f97bf8b77f3..95ec73f7dd80e 100644 --- a/pytorch_lightning/plugins/legacy/plugin_connector.py +++ b/pytorch_lightning/plugins/legacy/plugin_connector.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. from enum import Enum -from typing import List, Optional, Union +from typing import List, Optional, Union, Sequence +from pytorch_lightning.plugins import Plugin from pytorch_lightning.plugins.environments import ClusterEnvironment from pytorch_lightning.plugins.legacy.apex import ApexPlugin from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin @@ -26,22 +27,22 @@ class PluginConnector: - def __init__(self, trainer): + def __init__(self, trainer, plugins: Optional[Union[str, list]] = None): self.trainer = trainer - self.plugins = [] - self.ddp_plugin = DDPPlugin() + self.plugins = plugins or [] self.cloud_environment = None - - def on_trainer_init(self, plugins: Optional[Union[str, list]]): - self.plugins = plugins - if self.plugins is None: - self.plugins = [] + # self.ddp_plugin = DDPPlugin() self.plugins = self._convert_str_custom_plugins(self.plugins) - self.plugins = self._append_required_plugins(self.plugins) - self.__attach_ddp() + + # TODO: plugin dependencies + # self.plugins = self._append_required_plugins(self.plugins) + self.__attach_cluster() - self.__attach_amp() - self.__attach_apex() + + # TODO: attach custom training type and precision plugins + # self.__attach_ddp() + # self.__attach_amp() + # self.__attach_apex() def __attach_amp(self): amp_plugin = self.__attach_plugin(NativeAMPPlugin) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9565db4ddf2bc..5cdfa5021acb8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -306,7 +306,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.plugin_connector = PluginConnector(self) + self.plugin_connector = PluginConnector(self, plugins) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -417,7 +417,7 @@ def __init__( # last thing are the plugins which override whatever the trainer used by default # TODO: probably not needed anymore after refactor - self.plugin_connector.on_trainer_init(plugins) + # self.plugin_connector.on_trainer_init(plugins) # Callback system self.on_init_end()
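
Note on the "add forgotten generators" change above: contextlib.contextmanager only works when the decorated function is a generator, so the otherwise empty hook bodies need a `yield` to be usable in a `with` statement. Below is a minimal, self-contained Python sketch (independent of the Lightning classes touched by the patch) of the failure mode that the added `yield` prevents:

    import contextlib

    @contextlib.contextmanager
    def broken_context():
        """No yield: the function is not a generator, so entering the context fails."""

    @contextlib.contextmanager
    def working_context():
        """A no-op context manager, analogous to the base-plugin step contexts."""
        yield

    try:
        with broken_context():
            pass
    except TypeError as err:
        # contextlib calls next() on the value returned by the function,
        # which here is a plain None instead of a generator
        print(f"broken_context failed: {err}")

    with working_context():
        print("working_context ran its body")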