
Commit 07fdd95

tchaton, root, Ubuntu, and justusschock authored
[accelerator][BugFix] Resolve some test for 1 gpu (#5863)
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* update
* resolve flake8
* update
* update
* update
* update
* update
* all_gather
* update
* make plugins work, add misconfig for RPC
* update
* update
* remove breaking test
* resolve some tests
* resolve flake8
* revert to ddp_spawn

Co-authored-by: root <[email protected]>
Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Justus Schock <[email protected]>
1 parent 0ac5fc4 commit 07fdd95

28 files changed: +153 −79 lines

.drone.yml

Lines changed: 3 additions & 1 deletion
@@ -47,7 +47,9 @@ steps:
   - unzip -o legacy/checkpoints.zip -d legacy/
   - ls -l legacy/checkpoints/
   # testing...
-  - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8
+  - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests --ignore tests/plugins/test_sharded_plugin.py --ignore tests/trainer/test_dataloaders.py -v --durations=25 # --flake8
+  # Todo: Find why those tests are failing when run in the main pytest.
+  - python -m coverage run -a --source pytorch_lightning -m pytest tests/plugins/test_sharded_plugin.py tests/trainer/test_dataloaders.py -v --durations=25 # --flake8
   # Running special tests
   - sh tests/special_tests.sh
   - coverage report

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -151,3 +151,4 @@ wandb
 
 # dataset generated from bolts in examples.
 cifar-10-batches-py
+*.pt

pytorch_lightning/accelerators/accelerator.py

Lines changed: 13 additions & 0 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
 from typing import Any, Callable, Iterable, Optional, TYPE_CHECKING, Union
 
 import torch
@@ -374,3 +375,15 @@ def on_save(self, checkpoint):
 
     def barrier(self, name: Optional[str] = None) -> None:
         self.training_type_plugin.barrier(name=name)
+
+    def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False):
+        """
+        Function to gather a tensor from several distributed processes
+        Args:
+            tensor: tensor of shape (batch, ...)
+            group: the process group to gather results from. Defaults to all processes (world)
+            sync_grads: flag that allows users to synchronize gradients for all_gather op
+        Return:
+            A tensor of shape (world_size, batch, ...)
+        """
+        return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads)
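As a usage note, the sketch below shows how this new hook could be reached from user code; the module, the layer shape, and the trainer.accelerator_backend access path are illustrative assumptions, not part of the diff.

import torch
import pytorch_lightning as pl

class GatherExample(pl.LightningModule):
    # Illustrative module: only validation_step matters for this sketch.

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def validation_step(self, batch, batch_idx):
        # Per-process predictions of shape (batch, 2).
        preds = self.layer(batch)
        # Per the docstring above, the gathered result has shape
        # (world_size, batch, 2); in a single-process run,
        # all_gather_ddp_if_available should return the tensor unchanged.
        return self.trainer.accelerator_backend.all_gather(preds, sync_grads=False)

Per the docstring, sync_grads=True synchronizes gradients through the gather, which matters when the gathered tensor feeds a loss.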

pytorch_lightning/accelerators/accelerator_connector.py

File mode changed from 100644 to 100755
Lines changed: 22 additions & 9 deletions
@@ -33,7 +33,6 @@
     HorovodPlugin,
     NativeMixedPrecisionPlugin,
     PrecisionPlugin,
-    RPCPlugin,
     ShardedNativeMixedPrecisionPlugin,
     SingleDevicePlugin,
     SingleTPUPlugin,
@@ -116,11 +115,11 @@ def __init__(
         self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
         self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids)
 
-        self.handle_given_plugins(plugins)
-
         self.set_distributed_mode()
         self.configure_slurm_ddp()
 
+        self.handle_given_plugins(plugins)
+
         self.accelerator = self.select_accelerator()
 
         # override dist backend when using tpus
@@ -147,8 +146,10 @@ def __init__(
         self.replace_sampler_ddp = replace_sampler_ddp
 
     def handle_given_plugins(self, plugins: Optional[Sequence]):
-        if plugins is None:
-            return
+        plugins = plugins if plugins is not None else []
+
+        if isinstance(plugins, str):
+            plugins = [plugins]
 
         if not isinstance(plugins, Sequence):
             plugins = [plugins]
@@ -158,7 +159,10 @@ def handle_given_plugins(self, plugins: Optional[Sequence]):
         cluster_environment = None
 
         for plug in plugins:
-            if isinstance(plug, TrainingTypePlugin):
+            if isinstance(plug, str):
+                self.set_distributed_mode(plug)
+
+            elif isinstance(plug, TrainingTypePlugin):
                 if training_type is None:
                     training_type = plug
 
@@ -191,6 +195,7 @@ def handle_given_plugins(self, plugins: Optional[Sequence]):
             )
 
         self._training_type_plugin = training_type
+        self._training_type_plugin = self.training_type_plugin
         self._precision_plugin = precision
         self._cluster_environment = cluster_environment or self.select_cluster_environment()
 
@@ -206,6 +211,7 @@ def training_type_plugin(self) -> TrainingTypePlugin:
             self._training_type_plugin = self.select_training_type_plugin()
         else:
            self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin)
+
         return self._training_type_plugin
 
     @property
@@ -327,7 +333,7 @@ def select_precision_plugin(self):
 
     def select_training_type_plugin(self):
         if self.use_ddp2:
-            plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self._cluster_environment)
+            plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
         elif self.use_ddp:
             use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks
             use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic
@@ -359,7 +365,7 @@ def select_training_type_plugin(self):
             plugin = ddp_plugin_cls(
                 parallel_devices=self.parallel_devices,
                 num_nodes=self.num_nodes,
-                cluster_environment=self.select_cluster_environment(),
+                cluster_environment=self.cluster_environment,
                 sync_batchnorm=self.sync_batchnorm,
             )
         elif self.use_dp:
@@ -425,7 +431,11 @@ def select_cluster_environment(self):
             env = TorchElasticEnvironment()
         return env
 
-    def set_distributed_mode(self):
+    def set_distributed_mode(self, distributed_backend: Optional[str] = None):
+
+        if distributed_backend is not None:
+            self.distributed_backend = distributed_backend
+
         if isinstance(self.distributed_backend, Accelerator):
             return
 
@@ -484,6 +494,9 @@ def set_distributed_mode(self):
         ):
             self.num_processes = self.num_gpus
 
+        if (self._device_type == DeviceType.GPU and self._distrib_type == DistributedType.DDP2):
+            self.num_processes = self.num_nodes
+
         # Horovod is an extra case...
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
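Taken together, the handle_given_plugins changes mean plugins may now be None, a bare string, or a sequence, with string entries routed through set_distributed_mode. Below is a condensed, standalone sketch of just the normalization step; the function name and asserts are illustrative, not Lightning API.

from collections.abc import Sequence
from typing import Optional, Union

def normalize_plugins(plugins: Optional[Union[str, Sequence]]) -> list:
    # None becomes an empty list, so the caller can always iterate.
    plugins = plugins if plugins is not None else []
    # A bare string such as "ddp_spawn" is wrapped so it is handled as one
    # plugin reference, not iterated character by character.
    if isinstance(plugins, str):
        plugins = [plugins]
    # Any other non-sequence (e.g. a single plugin object) is wrapped too.
    if not isinstance(plugins, Sequence):
        plugins = [plugins]
    return list(plugins)

assert normalize_plugins(None) == []
assert normalize_plugins("ddp_spawn") == ["ddp_spawn"]
assert normalize_plugins(["ddp_spawn"]) == ["ddp_spawn"]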

pytorch_lightning/accelerators/tpu.py

Lines changed: 14 additions & 1 deletion
@@ -1,4 +1,5 @@
-from typing import Callable
+from typing import Any, Callable, Optional, Union
+import torch
 
 from torch.optim import Optimizer
 
@@ -28,3 +29,15 @@ def setup(self, trainer, model):
 
     def run_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs):
         xm.optimizer_step(optimizer, optimizer_args={'closure': lambda_closure, **kwargs})
+
+    def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False):
+        """
+        Function to gather a tensor from several distributed processes
+        Args:
+            tensor: tensor of shape (batch, ...)
+            group: the process group to gather results from. Defaults to all processes (world)
+            sync_grads: flag that allows users to synchronize gradients for all_gather op
+        Return:
+            A tensor of shape (world_size, batch, ...)
+        """
+        return xm.all_gather(tensor, group=group, sync_grads=sync_grads)

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 2 additions & 2 deletions
@@ -540,9 +540,9 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
 
         accelerator_backend = trainer.accelerator_backend
 
-        if accelerator_backend is not None and accelerator_backend.rpc_enabled:
+        if accelerator_backend.training_type_plugin.rpc_enabled:
             # RPCPlugin manages saving all model states
-            accelerator_backend.ddp_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
+            accelerator_backend.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
         else:
             self._save_model(last_filepath, trainer, pl_module)
         if (
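This change follows the commit's broader refactor: RPC handling now hangs off the training-type plugin rather than a ddp_plugin attribute on the accelerator. A minimal sketch of the resulting dispatch shape; all class and function names here are illustrative stand-ins, not Lightning API.

class PlainPlugin:
    rpc_enabled = False

class RpcLikePlugin:
    rpc_enabled = True

    def rpc_save_model(self, save_fn, filepath):
        # The real RPCPlugin coordinates saving model state across RPC workers.
        save_fn(filepath)

def save_last_checkpoint(training_type_plugin, save_fn, filepath):
    # Mirror of the branch above: the plugin decides how the save happens.
    if training_type_plugin.rpc_enabled:
        training_type_plugin.rpc_save_model(save_fn, filepath)
    else:
        save_fn(filepath)

save_last_checkpoint(PlainPlugin(), lambda path: print(f"saved {path}"), "last.ckpt")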

pytorch_lightning/plugins/precision/apex_amp.py

Lines changed: 14 additions & 2 deletions
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable, List, Tuple
+from typing import List, Tuple, Callable
 
 import torch
 from torch.optim import Optimizer
@@ -38,6 +38,8 @@ def connect(self, model: torch.nn.Module, optimizers, lr_schedulers):
         """Connects the precision plugin to the training process,
         configures apex and reinits the schedulers
         """
+        if model.device.type != "cuda":
+            return model, optimizers, lr_schedulers
         model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level)
         self.reinit_scheduler_properties(optimizers, lr_schedulers)
         return model, optimizers, lr_schedulers
@@ -71,7 +73,7 @@ def backward(
         # do backward pass
         # TODO: not entirely sure, why we need this
         if model is not None and isinstance(model, LightningModule):
-            model.backward(closure_loss, optimizer, opt_idx)
+            model.backward(closure_loss, optimizer, opt_idx, **kwargs)
 
         # TODO: avoid dev_debugger and track these calls with mock
         model.trainer.dev_debugger.track_event('AMP', str(AMPType.APEX))
@@ -90,6 +92,16 @@ def backward(
         closure_loss = closure_loss.detach()
         return closure_loss
 
+    def pre_optimizer_step(
+        self, pl_module: LightningModule, optimizer: Optimizer, optimizer_idx: int, closure: Callable, **kwargs
+    ) -> bool:
+        """Hook to do something before each optimizer step."""
+        # Apex: Amp does not support closure use with optimizers
+        closure()
+        optimizer.step()
+        return False
+
+
     def configure_apex(
         self,
         amp: object,
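The comment in pre_optimizer_step is the key: torch.optim optimizers accept an optional closure in step(closure), but Apex amp's scaled-loss flow does not compose with it, so the plugin evaluates the closure itself and then steps without arguments. A self-contained sketch of the two call patterns, with a plain model standing in for an Apex-initialized one:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch = torch.randn(8, 4)

def closure():
    # Recompute the loss and its gradients for this step.
    optimizer.zero_grad()
    loss = model(batch).sum()
    loss.backward()
    return loss

# Native PyTorch: the optimizer may run the closure internally.
optimizer.step(closure)

# The pattern pre_optimizer_step falls back to: evaluate the closure
# first, then step with no arguments.
closure()
optimizer.step()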

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
 from pytorch_lightning.overrides.distributed import prepare_for_backward
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
-from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _PYTORCH_GREATER_EQUAL_THAN_1_7_0, rank_zero_warn
+from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_7_0, rank_zero_warn
 from pytorch_lightning.utilities.distributed import (
     find_free_network_port,
     rank_zero_only,
@@ -181,7 +181,7 @@ def set_world_ranks(self):
 
     def pre_configure_ddp(self):
         # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()``` breaking manual_optimization
-        if _PYTORCH_GREATER_EQUAL_THAN_1_7_0 and not self.lightning_module.automatic_optimization:
+        if _PYTORCH_GREATER_EQUAL_1_7_0 and not self.lightning_module.automatic_optimization:
             rank_zero_warn(
                 "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` "
                 "to properly work with DDP."

pytorch_lightning/plugins/training_type/ddp_spawn.py

Lines changed: 3 additions & 2 deletions
@@ -27,7 +27,7 @@
 from pytorch_lightning.overrides.distributed import prepare_for_backward
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
-from pytorch_lightning.utilities import _PYTORCH_GREATER_EQUAL_THAN_1_7_0
+from pytorch_lightning.utilities import _PYTORCH_GREATER_EQUAL_1_7_0
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.distributed import (
@@ -91,6 +91,7 @@ def setup(self, model):
     def set_world_ranks(self, process_idx):
         self.local_rank = process_idx
         self.node_rank = self.cluster_environment.node_rank()
+        self.task_idx = self.cluster_local_rank
         self.global_rank = self.node_rank * self.num_processes + self.local_rank
         self.world_size = self.num_nodes * self.num_processes
 
@@ -164,7 +165,7 @@ def post_training(self):
 
     def pre_configure_ddp(self):
         # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()``` breaking manual_optimization
-        if _PYTORCH_GREATER_EQUAL_THAN_1_7_0 and not self.lightning_module.automatic_optimization:
+        if _PYTORCH_GREATER_EQUAL_1_7_0 and not self.lightning_module.automatic_optimization:
             rank_zero_warn(
                 "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` "
                 "to properly work with DDP."

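Beyond the constant rename (_PYTORCH_GREATER_EQUAL_THAN_1_7_0 becomes _PYTORCH_GREATER_EQUAL_1_7_0, in both ddp.py and ddp_spawn.py), the warning describes a real gate: from PyTorch 1.7, DDP's bucket rebuilding breaks manual optimization unless unused parameters are tracked. A rough sketch of an equivalent check, computed locally here rather than imported from pytorch_lightning.utilities:

import torch
from packaging.version import Version

# Local stand-in for pytorch_lightning.utilities._PYTORCH_GREATER_EQUAL_1_7_0;
# the "+cuXXX" local-version suffix is stripped before comparing.
_PYTORCH_GREATER_EQUAL_1_7_0 = Version(torch.__version__.split("+")[0]) >= Version("1.7.0")

ddp_kwargs = {}
if _PYTORCH_GREATER_EQUAL_1_7_0:
    # PyTorch 1.7 DDP calls self.reducer._rebuild_buckets(), which breaks
    # manual optimization unless unused parameters are tracked.
    ddp_kwargs["find_unused_parameters"] = True

model = torch.nn.Linear(4, 1)
# Wrapping requires an initialized process group, so it is left commented:
# ddp_model = torch.nn.parallel.DistributedDataParallel(model, **ddp_kwargs)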
pytorch_lightning/plugins/training_type/parallel.py

Lines changed: 8 additions & 1 deletion
@@ -36,10 +36,17 @@ def __init__(
     ):
         super().__init__()
         self.parallel_devices = parallel_devices
-        self.local_rank = 0
         self.world_size = 1
+        self.local_rank = 0
         self.cluster_environment = cluster_environment
 
+    @property
+    def cluster_local_rank(self):
+        try:
+            return self.cluster_environment.local_rank()
+        except KeyError:
+            return 0
+
     @property
     @abstractmethod
     def root_device(self):
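The new cluster_local_rank property covers environments that read the local rank from a launcher-provided variable: with no launcher (the single-GPU case this commit targets), that lookup raises KeyError and the property falls back to 0. A standalone sketch of the same pattern; the environment class is an illustrative stand-in:

import os

class EnvReadingClusterEnvironment:
    # Illustrative stand-in: reads the local rank exported by a launcher.

    def local_rank(self) -> int:
        return int(os.environ["LOCAL_RANK"])  # KeyError when no launcher set it

def cluster_local_rank(env: EnvReadingClusterEnvironment) -> int:
    try:
        return env.local_rank()
    except KeyError:
        # Single-process run: behave as the only (zeroth) local process.
        return 0

print(cluster_local_rank(EnvReadingClusterEnvironment()))  # 0 unless LOCAL_RANK is set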
