Commit 8be8909

base cluster env
1 parent c912c4b commit 8be8909

File tree

16 files changed (+121, -38 lines)


azure-pipelines.yml

Lines changed: 6 additions & 1 deletion
@@ -72,7 +72,12 @@ jobs:
     displayName: 'Get legacy checkpoints'

   - script: |
-      python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50
+      # python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50
+      python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests --ignore tests/plugins/test_sharded_plugin.py --ignore tests/trainer/test_dataloaders.py --ignore tests/metrics -v --durations=50
+      # TODO: Find out why these tests fail when run in the main pytest run.
+      python -m coverage run -a --source pytorch_lightning -m pytest tests/metrics -v --durations=50
+      python -m coverage run -a --source pytorch_lightning -m pytest tests/plugins/test_sharded_plugin.py tests/trainer/test_dataloaders.py -v --durations=50
+
     displayName: 'Testing: standard'

   - script: |

pytorch_lightning/accelerators/accelerator.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,9 @@
 from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
 from pytorch_lightning.utilities.enums import AMPType, LightningEnum

+if TYPE_CHECKING:
+    from pytorch_lightning.trainer.trainer import Trainer
+

 class Accelerator(object):
     """

pytorch_lightning/accelerators/accelerator_connector.py

File mode changed from 100644 to 100755
Lines changed: 2 additions & 6 deletions
@@ -428,7 +428,7 @@ def select_accelerator(self) -> Accelerator:
             training_type_plugin=self.training_type_plugin,
         )

-    def select_cluster_environment(self) -> ClusterEnvironment:
+    def select_cluster_environment(self):
         if self._cluster_environment is not None:
             return self._cluster_environment
         if self.is_slurm_managing_tasks:
@@ -438,12 +438,8 @@ def select_cluster_environment(self) -> ClusterEnvironment:
             os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
         elif self.is_using_torchelastic:
             env = TorchElasticEnvironment()
-            # TODO: decouple DDP from TE
-            # refactor and let generic cluster env hold the information about who spawns the processes
-            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
         else:
-            # TODO: maybe introduce a DefaultEnvironment?
-            env = TorchElasticEnvironment()
+            env = DefaultEnvironment()
        return env

    def set_distributed_mode(self, distributed_backend: Optional[str] = None):

pytorch_lightning/plugins/environments/cluster_environment.py

Lines changed: 22 additions & 12 deletions
@@ -11,26 +11,36 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from abc import ABC, abstractmethod
+from typing import Optional

-from pytorch_lightning.plugins.legacy.plugin import LightningPlugin

+class ClusterEnvironment(ABC):
+    """ Specification of a cluster environment. """

-class ClusterEnvironment(LightningPlugin):
+    @abstractmethod
+    def spawns_children(self) -> bool:
+        """ Whether the environment spawns the subprocesses or not. """

-    def __init__(self):
-        self._world_size = None
+    @abstractmethod
+    def master_address(self) -> str:
+        """ The master address through which all processes connect and communicate. """

-    def master_address(self):
-        pass
-
-    def master_port(self):
-        pass
+    @abstractmethod
+    def master_port(self) -> int:
+        """ An open and configured port in the master node through which all processes communicate. """

-    def world_size(self) -> int:
-        return self._world_size
+    @abstractmethod
+    def world_size(self) -> Optional[int]:
+        """ The number of processes across all devices and nodes. """

+    @abstractmethod
     def local_rank(self) -> int:
-        pass
+        """ The rank (index) of the currently running process inside of the current node. """
+
+    @abstractmethod
+    def node_rank(self) -> int:
+        """ The rank (index) of the node on which the current process runs. """

     def node_rank(self) -> int:
         pass
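
With this change the base class is a plain specification (ABC) rather than a LightningPlugin. As a rough illustration of the contract it defines, here is a minimal sketch of a custom environment for a hypothetical scheduler that exports its own rank variables; the class name and the MY_* environment variables are invented for this example, and only the method names and signatures come from the diff above:

import os
from typing import Optional

from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment


class MyClusterEnvironment(ClusterEnvironment):
    """Hypothetical environment for a scheduler that sets its own rank variables."""

    def spawns_children(self) -> bool:
        # The scheduler already launches one process per device,
        # so Lightning should not call the children scripts itself.
        return True

    def master_address(self) -> str:
        return os.environ["MY_MASTER_ADDR"]

    def master_port(self) -> int:
        return int(os.environ["MY_MASTER_PORT"])

    def world_size(self) -> Optional[int]:
        return int(os.environ["MY_WORLD_SIZE"])

    def local_rank(self) -> int:
        return int(os.environ["MY_LOCAL_RANK"])

    def node_rank(self) -> int:
        return int(os.environ["MY_NODE_RANK"])
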
Lines changed: 55 additions & 0 deletions (new file)
@@ -0,0 +1,55 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Optional
+
+from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
+from pytorch_lightning.utilities.distributed import find_free_network_port
+
+
+class DefaultEnvironment(ClusterEnvironment):
+    """
+    A default environment for a single node or free cluster (not managed).
+
+    The master process must be launched by the user and Lightning will spawn new
+    worker processes for distributed training, either in a single node or across multiple nodes.
+
+    If the master address and port are not provided, the default environment will choose them
+    automatically. It is recommended to use this default environment for single-node distributed
+    training as it provides the most convenient way to launch the training script.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._world_size = None
+
+    def spawns_children(self) -> bool:
+        return False
+
+    def master_address(self) -> str:
+        return os.environ.get("MASTER_ADDR", "127.0.0.1")
+
+    def master_port(self) -> int:
+        return int(os.environ.get("MASTER_PORT", find_free_network_port()))
+
+    def world_size(self) -> Optional[int]:
+        return None
+
+    def local_rank(self) -> int:
+        return int(os.environ.get("LOCAL_RANK", 0))
+
+    def node_rank(self) -> int:
+        group_rank = os.environ.get("GROUP_RANK", 0)
+        return int(os.environ.get("NODE_RANK", group_rank))
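
A short sketch of the fallback behaviour this class encodes, for a manual launch outside any managed cluster. The concrete values are illustrative only, and the module path of the new file is not shown in this view:

import os

# DefaultEnvironment reads these variables if present; otherwise it falls back
# to localhost, a free port from find_free_network_port(), and rank 0 everywhere.
os.environ.setdefault("MASTER_ADDR", "10.0.0.1")  # fallback would be 127.0.0.1
os.environ.setdefault("MASTER_PORT", "29500")     # fallback would be a random free port
os.environ.setdefault("NODE_RANK", "1")           # fallback is GROUP_RANK, then 0
os.environ.setdefault("LOCAL_RANK", "0")          # fallback is 0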

pytorch_lightning/plugins/environments/slurm_environment.py

Lines changed: 4 additions & 1 deletion
@@ -24,6 +24,9 @@ class SLURMEnvironment(ClusterEnvironment):
     def __init__(self):
         super().__init__()

+    def spawns_children(self) -> bool:
+        return True
+
     def master_address(self):
         # figure out the root node addr
         slurm_nodelist = os.environ.get("SLURM_NODELIST")
@@ -65,7 +68,7 @@ def master_port(self):
         return default_port

     def world_size(self):
-        return self._world_size
+        return None

     def local_rank(self):
         return int(os.environ['SLURM_LOCALID'])

pytorch_lightning/plugins/environments/torchelastic_environment.py

Lines changed: 3 additions & 0 deletions
@@ -24,6 +24,9 @@ class TorchElasticEnvironment(ClusterEnvironment):
     def __init__(self):
         super().__init__()

+    def spawns_children(self) -> bool:
+        return True
+
     def master_address(self):
         if "MASTER_ADDR" not in os.environ:
             rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")

pytorch_lightning/plugins/precision/apex_amp.py

Lines changed: 9 additions & 0 deletions
@@ -92,6 +92,15 @@ def backward(
         closure_loss = closure_loss.detach()
         return closure_loss

+    def pre_optimizer_step(
+        self, pl_module: LightningModule, optimizer: Optimizer, optimizer_idx: int, closure: Callable, **kwargs
+    ) -> bool:
+        """Hook to do something before each optimizer step."""
+        # Apex: Amp does not support closure use with optimizers
+        closure()
+        optimizer.step()
+        return False
+
     def configure_apex(
         self,
         amp: object,

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 6 additions & 12 deletions
@@ -27,10 +27,9 @@
 from pytorch_lightning.distributed import LightningDistributed
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.distributed import prepare_for_backward
-from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
-from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _TORCH_GREATER_EQUAL_1_7, rank_zero_warn
+from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_7_0, rank_zero_warn
 from pytorch_lightning.utilities.distributed import (
     find_free_network_port,
     rank_zero_only,
@@ -88,8 +87,7 @@ def setup(self, model):
         self._model = model

         # start the other scripts
-        # TODO: refactor and let generic cluster env hold the information about who spawns the processes
-        if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1":
+        if not self.cluster_environment.spawns_children() and os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1":
             self._call_children_scripts()

         # set the task idx
@@ -103,15 +101,12 @@ def _call_children_scripts(self):
         self._has_spawned_children = True

         # DDP Environment variables
-        os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1")
-        os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port()))
+        os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
+        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

         # allow the user to pass the node rank
-        node_rank = "0"
-        node_rank = os.environ.get("NODE_RANK", node_rank)
-        node_rank = os.environ.get("GROUP_RANK", node_rank)
-        os.environ["NODE_RANK"] = node_rank
-        os.environ["LOCAL_RANK"] = "0"
+        os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
+        os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

         # when user is using hydra find the absolute path
         path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path
@@ -205,7 +200,6 @@ def determine_ddp_device_ids(self):
         return [self.root_device.index]

     def init_ddp_connection(self, global_rank: int, world_size: int) -> None:
-        # TODO: From where to get cluster environment?
         os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())
         os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
         os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size())

pytorch_lightning/plugins/training_type/ddp_spawn.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def distributed_sampler_kwargs(self):
     def setup(self, model):
         self._model = model

-        os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port()))
+        os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

         # pass in a state q
         smp = mp.get_context("spawn")
