Commit 03bcc33

Merge b1136ac into 248a8e8
2 parents 248a8e8 + b1136ac commit 03bcc33

19 files changed: +287 additions, -91 deletions

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))


+- Added `LightningEnvironment` for Lightning-specific DDP ([#5915](https://github.com/PyTorchLightning/pytorch-lightning/pull/5915))
+
+
 - Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274))


pytorch_lightning/plugins/environments/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment  # noqa: F401
+from pytorch_lightning.plugins.environments.lightning_environment import LightningEnvironment  # noqa: F401
 from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment  # noqa: F401
 from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment  # noqa: F401

pytorch_lightning/plugins/environments/cluster_environment.py

Lines changed: 20 additions & 11 deletions

@@ -11,24 +11,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from abc import ABC, abstractmethod
+from typing import Optional


-class ClusterEnvironment:
+class ClusterEnvironment(ABC):
+    """ Specification of a cluster environment. """

-    def __init__(self):
-        self._world_size = None
+    @abstractmethod
+    def creates_children(self) -> bool:
+        """ Whether the environment creates the subprocesses or not. """

-    def master_address(self):
-        pass
+    @abstractmethod
+    def master_address(self) -> str:
+        """ The master address through which all processes connect and communicate. """

-    def master_port(self):
-        pass
+    @abstractmethod
+    def master_port(self) -> int:
+        """ An open and configured port in the master node through which all processes communicate. """

-    def world_size(self) -> int:
-        return self._world_size
+    @abstractmethod
+    def world_size(self) -> Optional[int]:
+        """ The number of processes across all devices and nodes. """

+    @abstractmethod
     def local_rank(self) -> int:
-        pass
+        """ The rank (index) of the currently running process inside of the current node. """

+    @abstractmethod
     def node_rank(self) -> int:
-        pass
+        """ The rank (index) of the node on which the current process runs. """
pytorch_lightning/plugins/environments/lightning_environment.py

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+from typing import Optional
+
+from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
+
+
+class LightningEnvironment(ClusterEnvironment):
+    """
+    The default environment used by Lightning for a single node or free cluster (not managed).
+
+    The master process must be launched by the user and Lightning will spawn new
+    worker processes for distributed training, either in a single node or across multiple nodes.
+
+    If the master address and port are not provided, the default environment will choose them
+    automatically. It is recommended to use this default environment for single-node distributed
+    training as it provides the most convenient way to launch the training script.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._master_port = None
+
+    def creates_children(self) -> bool:
+        return False
+
+    def master_address(self) -> str:
+        return os.environ.get("MASTER_ADDR", "127.0.0.1")
+
+    def master_port(self) -> int:
+        if self._master_port is None:
+            self._master_port = os.environ.get("MASTER_PORT", find_free_network_port())
+        return int(self._master_port)
+
+    def world_size(self) -> Optional[int]:
+        return None
+
+    def local_rank(self) -> int:
+        return int(os.environ.get("LOCAL_RANK", 0))
+
+    def node_rank(self) -> int:
+        group_rank = os.environ.get("GROUP_RANK", 0)
+        return int(os.environ.get("NODE_RANK", group_rank))
+
+
+def find_free_network_port() -> int:
+    """
+    Finds a free port on localhost.
+    It is useful in single-node training when we don't want to connect to a real master node but
+    have to set the `MASTER_PORT` environment variable.
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.bind(("", 0))
+    s.listen(1)
+    port = s.getsockname()[1]
+    s.close()
+    return port

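A quick sketch of how the new defaults resolve when nothing is pre-configured; the environment variables are cleared here purely for illustration, and the exact port value will differ per run:

import os

from pytorch_lightning.plugins.environments import LightningEnvironment

# start from a clean slate so the fallbacks are visible
for var in ("MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK", "NODE_RANK", "GROUP_RANK"):
    os.environ.pop(var, None)

env = LightningEnvironment()
print(env.creates_children())      # False: Lightning itself launches the worker scripts
print(env.master_address())        # "127.0.0.1" since MASTER_ADDR is unset
port = env.master_port()           # a free port picked by find_free_network_port()
print(port == env.master_port())   # True: the chosen port is cached and reused
print(env.local_rank(), env.node_rank())  # 0 0
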
pytorch_lightning/plugins/environments/slurm_environment.py

Lines changed: 10 additions & 7 deletions

@@ -26,7 +26,10 @@ class SLURMEnvironment(ClusterEnvironment):
     def __init__(self):
         super().__init__()

-    def master_address(self):
+    def creates_children(self) -> bool:
+        return True
+
+    def master_address(self) -> str:
         # figure out the root node addr
         slurm_nodelist = os.environ.get("SLURM_NODELIST")
         if slurm_nodelist:
@@ -39,7 +42,7 @@ def master_address(self):
         log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
         return root_node

-    def master_port(self):
+    def master_port(self) -> int:
         # -----------------------
         # SLURM JOB = PORT number
         # -----------------------
@@ -64,18 +67,18 @@ def master_port(self):

         log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

-        return default_port
+        return int(default_port)

     def world_size(self):
-        return self._world_size
+        return None

-    def local_rank(self):
+    def local_rank(self) -> int:
         return int(os.environ['SLURM_LOCALID'])

-    def node_rank(self):
+    def node_rank(self) -> int:
         return int(os.environ['SLURM_NODEID'])

-    def resolve_root_node_address(self, root_node):
+    def resolve_root_node_address(self, root_node: str) -> str:
         if '[' in root_node:
             name, numbers = root_node.split('[', maxsplit=1)
             number = numbers.split(',', maxsplit=1)[0]

pytorch_lightning/plugins/environments/torchelastic_environment.py

Lines changed: 11 additions & 6 deletions

@@ -14,6 +14,7 @@

 import logging
 import os
+from typing import Optional

 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.utilities import rank_zero_warn
@@ -26,27 +27,31 @@ class TorchElasticEnvironment(ClusterEnvironment):
     def __init__(self):
         super().__init__()

-    def master_address(self):
+    def creates_children(self) -> bool:
+        return True
+
+    def master_address(self) -> str:
         if "MASTER_ADDR" not in os.environ:
             rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")
             os.environ["MASTER_ADDR"] = "127.0.0.1"
         log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
         master_address = os.environ.get('MASTER_ADDR')
         return master_address

-    def master_port(self):
+    def master_port(self) -> int:
         if "MASTER_PORT" not in os.environ:
             rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910")
             os.environ["MASTER_PORT"] = "12910"
         log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

-        port = os.environ.get('MASTER_PORT')
+        port = int(os.environ.get('MASTER_PORT'))
         return port

-    def world_size(self):
-        return os.environ.get('WORLD_SIZE')
+    def world_size(self) -> Optional[int]:
+        world_size = os.environ.get('WORLD_SIZE')
+        return int(world_size) if world_size is not None else world_size

-    def local_rank(self):
+    def local_rank(self) -> int:
         return int(os.environ['LOCAL_RANK'])

     def node_rank(self) -> int:

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 6 additions & 17 deletions

@@ -30,20 +30,14 @@
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _TORCH_GREATER_EQUAL_1_7, rank_zero_warn
-from pytorch_lightning.utilities.distributed import (
-    find_free_network_port,
-    rank_zero_only,
-    ReduceOp,
-    sync_ddp_if_available,
-)
+from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything

 if _HYDRA_AVAILABLE:
     from hydra.core.hydra_config import HydraConfig
     from hydra.utils import get_original_cwd, to_absolute_path

-
 log = logging.getLogger(__name__)


@@ -90,8 +84,7 @@ def setup(self, model):
         self._model = model

         # start the other scripts
-        # TODO: refactor and let generic cluster env hold the information about who spawns the processes
-        if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1":
+        if not self.cluster_environment.creates_children() and os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1":
             self._call_children_scripts()

         # set the task idx
@@ -105,15 +98,12 @@ def _call_children_scripts(self):
         self._has_spawned_children = True

         # DDP Environment variables
-        os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1")
-        os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port()))
+        os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
+        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

         # allow the user to pass the node rank
-        node_rank = "0"
-        node_rank = os.environ.get("NODE_RANK", node_rank)
-        node_rank = os.environ.get("GROUP_RANK", node_rank)
-        os.environ["NODE_RANK"] = node_rank
-        os.environ["LOCAL_RANK"] = "0"
+        os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
+        os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

         # when user is using hydra find the absolute path
         path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path
@@ -209,7 +199,6 @@ def determine_ddp_device_ids(self):
         return [self.root_device.index]

     def init_ddp_connection(self, global_rank: int, world_size: int) -> None:
-        # TODO: From where to get cluster environment?
         os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())
         os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
         os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())

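The condition added to `DDPPlugin.setup()` above replaces the old hard-coded `PL_IN_DDP_SUBPROCESS` handling: environments that launch their own workers (SLURM, TorchElastic) report `creates_children() == True` and skip `_call_children_scripts()`, while `LightningEnvironment` reports `False` and lets the master process spawn the children. A simplified restatement of that gate (illustrative helper, not code from the commit):

import os

def should_call_children_scripts(cluster_environment) -> bool:
    # spawn child scripts only if the environment does not create workers itself
    # and this process is not already one of the spawned children
    in_subprocess = os.environ.get("PL_IN_DDP_SUBPROCESS", "0") == "1"
    return not cluster_environment.creates_children() and not in_subprocess
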
pytorch_lightning/plugins/training_type/ddp_spawn.py

Lines changed: 3 additions & 9 deletions

@@ -30,13 +30,7 @@
 from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.distributed import (
-    find_free_network_port,
-    rank_zero_only,
-    rank_zero_warn,
-    ReduceOp,
-    sync_ddp_if_available,
-)
+from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.seed import seed_everything

 log = logging.getLogger(__name__)
@@ -84,7 +78,7 @@ def distributed_sampler_kwargs(self):
     def setup(self, model):
         self._model = model

-        os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port()))
+        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

         # pass in a state q
         smp = mp.get_context("spawn")
@@ -93,7 +87,7 @@ def setup(self, model):
     def set_world_ranks(self, process_idx):
         self.local_rank = process_idx
         self.node_rank = self.cluster_environment.node_rank()
-        self.task_idx = self.cluster_local_rank
+        self.task_idx = self.cluster_environment.local_rank()
         self.global_rank = self.node_rank * self.num_processes + self.local_rank
         self.world_size = self.num_nodes * self.num_processes

pytorch_lightning/plugins/training_type/parallel.py

Lines changed: 0 additions & 7 deletions

@@ -40,13 +40,6 @@ def __init__(
         self.local_rank = 0
         self.cluster_environment = cluster_environment

-    @property
-    def cluster_local_rank(self):
-        try:
-            return self.cluster_environment.local_rank()
-        except KeyError:
-            return 0
-
     @property
     @abstractmethod
     def root_device(self):

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 7 additions & 9 deletions

@@ -42,7 +42,12 @@
     TPUSpawnPlugin,
     TrainingTypePlugin,
 )
-from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.plugins.environments import (
+    ClusterEnvironment,
+    LightningEnvironment,
+    SLURMEnvironment,
+    TorchElasticEnvironment,
+)
 from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
 from pytorch_lightning.utilities import (
     _APEX_AVAILABLE,
@@ -451,17 +456,10 @@ def select_cluster_environment(self) -> ClusterEnvironment:
             return self._cluster_environment
         if self.is_slurm_managing_tasks:
             env = SLURMEnvironment()
-            # TODO: decouple DDP from SLURM
-            # refactor and let generic cluster env hold the information about who spawns the processes
-            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
         elif self.is_using_torchelastic:
             env = TorchElasticEnvironment()
-            # TODO: decouple DDP from TE
-            # refactor and let generic cluster env hold the information about who spawns the processes
-            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
         else:
-            # TODO: maybe introduce a DefaultEnvironment?
-            env = TorchElasticEnvironment()
+            env = LightningEnvironment()
         return env

     def set_distributed_mode(self, distributed_backend: Optional[str] = None):

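With this change, `select_cluster_environment()` falls back to the new `LightningEnvironment` whenever the job is not managed by SLURM or TorchElastic, and an environment supplied by the user still wins via the `self._cluster_environment` early return. A usage sketch, assuming (as in Lightning 1.2) that a `ClusterEnvironment` instance can be passed to the `Trainer` through the `plugins` argument:

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import LightningEnvironment

# Without SLURM or TorchElastic variables set, the connector would pick
# LightningEnvironment on its own; passing it explicitly overrides the selection.
trainer = Trainer(accelerator="ddp", gpus=2, plugins=[LightningEnvironment()])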