From 374e2ce442b1490a657886c056738eb3447b440a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 17:50:36 +0200 Subject: [PATCH 01/18] rename occurrences of master port, master address, maser node, master process --- docs/source/clouds/cluster.rst | 4 +-- docs/source/common/trainer.rst | 4 +-- pytorch_lightning/accelerators/accelerator.py | 2 +- .../environments/cluster_environment.py | 8 ++--- .../environments/kubeflow_environment.py | 4 +-- .../environments/lightning_environment.py | 16 ++++----- .../plugins/environments/lsf_environment.py | 34 +++++++++---------- .../plugins/environments/slurm_environment.py | 4 +-- .../environments/torchelastic_environment.py | 4 +-- .../plugins/training_type/ddp.py | 4 +-- .../plugins/training_type/ddp_spawn.py | 4 +-- .../plugins/training_type/deepspeed.py | 6 ++-- .../plugins/training_type/tpu_spawn.py | 2 +- .../training_type/training_type_plugin.py | 2 +- pytorch_lightning/utilities/distributed.py | 6 ++-- .../test_accelerator_connector.py | 2 +- tests/models/test_tpu.py | 2 +- .../environments/test_kubeflow_environment.py | 8 ++--- .../test_lightning_environment.py | 14 ++++---- .../environments/test_lsf_environment.py | 8 ++--- .../environments/test_slurm_environment.py | 12 +++---- .../test_torchelastic_environment.py | 8 ++--- tests/plugins/test_deepspeed_plugin.py | 4 +-- 23 files changed, 81 insertions(+), 81 deletions(-) diff --git a/docs/source/clouds/cluster.rst b/docs/source/clouds/cluster.rst index a4a1524c05734..40eaaab1fc11a 100644 --- a/docs/source/clouds/cluster.rst +++ b/docs/source/clouds/cluster.rst @@ -306,10 +306,10 @@ and node rank (node id). Here is an example of a custom def node_rank(self) -> int: return int(os.environ["NODE_RANK"]) - def master_address(self) -> str: + def main_address(self) -> str: return os.environ["MASTER_ADDRESS"] - def master_port(self) -> int: + def main_port(self) -> int: return int(os.environ["MASTER_PORT"]) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index ef04d27136eef..2934e980501d6 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1077,10 +1077,10 @@ To define your own behavior, subclass the relevant class and pass it in. Here's class MyCluster(ClusterEnvironment): - def master_address(self): + def main_address(self): return your_master_address - def master_port(self): + def main_port(self): return your_master_port def world_size(self): diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 28145db13be00..f473cf9c79aef 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -492,7 +492,7 @@ def results(self) -> Any: This property is deprecated in v1.5 and will be removed in v1.6. Please call `training_type_plugin.results` directly. - In distributed training, we make sure to transfer the results to the appropriate master process. + In distributed training, we make sure to transfer the results to the appropriate main process. """ rank_zero_deprecation( "`Accelerator.results` is deprecated in v1.5 and will be removed in v1.6. 
" diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 251e8ba16f6e7..43d86feb2745e 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -22,12 +22,12 @@ def creates_children(self) -> bool: """Whether the environment creates the subprocesses or not.""" @abstractmethod - def master_address(self) -> str: - """The master address through which all processes connect and communicate.""" + def main_address(self) -> str: + """The main address through which all processes connect and communicate.""" @abstractmethod - def master_port(self) -> int: - """An open and configured port in the master node through which all processes communicate.""" + def main_port(self) -> int: + """An open and configured port in the main node through which all processes communicate.""" @abstractmethod def world_size(self) -> int: diff --git a/pytorch_lightning/plugins/environments/kubeflow_environment.py b/pytorch_lightning/plugins/environments/kubeflow_environment.py index 10a020d35a529..3f8f2c9b7af65 100644 --- a/pytorch_lightning/plugins/environments/kubeflow_environment.py +++ b/pytorch_lightning/plugins/environments/kubeflow_environment.py @@ -38,10 +38,10 @@ def is_using_kubeflow() -> bool: def creates_children(self) -> bool: return True - def master_address(self) -> str: + def main_address(self) -> str: return os.environ["MASTER_ADDR"] - def master_port(self) -> int: + def main_port(self) -> int: return int(os.environ["MASTER_PORT"]) def world_size(self) -> int: diff --git a/pytorch_lightning/plugins/environments/lightning_environment.py b/pytorch_lightning/plugins/environments/lightning_environment.py index b3558c23d6b94..071b89cf46433 100644 --- a/pytorch_lightning/plugins/environments/lightning_environment.py +++ b/pytorch_lightning/plugins/environments/lightning_environment.py @@ -29,14 +29,14 @@ class LightningEnvironment(ClusterEnvironment): 2. The user launches all processes manually or with utilities like :code:`torch.distributed.launch`. The appropriate environment variables need to be set, and at minimum :code:`LOCAL_RANK`. - If the master address and port are not provided, the default environment will choose them + If the main address and port are not provided, the default environment will choose them automatically. It is recommended to use this default environment for single-node distributed training as it provides a convenient way to launch the training script. """ def __init__(self): super().__init__() - self._master_port = None + self._main_port = None self._global_rank: int = 0 self._world_size: int = 1 @@ -48,13 +48,13 @@ def creates_children(self) -> bool: """ return "LOCAL_RANK" in os.environ - def master_address(self) -> str: + def main_address(self) -> str: return os.environ.get("MASTER_ADDR", "127.0.0.1") - def master_port(self) -> int: - if self._master_port is None: - self._master_port = os.environ.get("MASTER_PORT", find_free_network_port()) - return int(self._master_port) + def main_port(self) -> int: + if self._main_port is None: + self._main_port = os.environ.get("MASTER_PORT", find_free_network_port()) + return int(self._main_port) def world_size(self) -> int: return self._world_size @@ -84,7 +84,7 @@ def teardown(self) -> None: def find_free_network_port() -> int: """Finds a free port on localhost. 
- It is useful in single-node training when we don't want to connect to a real master node but have to set the + It is useful in single-node training when we don't want to connect to a real main node but have to set the `MASTER_PORT` environment variable. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index af6bfbb8163c9..368c6f067e150 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -41,10 +41,10 @@ class LSFEnvironment(ClusterEnvironment): """ def __init__(self): - self._master_address = self._get_master_address() - self._master_port = self._get_master_port() - log.debug(f"MASTER_ADDR: {self._master_address}") - log.debug(f"MASTER_PORT: {self._master_port}") + self._main_address = self._get_main_address() + self._main_port = self._get_main_port() + log.debug(f"MASTER_ADDR: {self._main_address}") + log.debug(f"MASTER_PORT: {self._main_port}") @staticmethod def is_using_lsf() -> bool: @@ -55,13 +55,13 @@ def is_using_lsf() -> bool: def creates_children(self) -> bool: return True - def master_address(self): - """The master address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" - return self._master_address + def main_address(self): + """The main address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" + return self._main_address - def master_port(self): - """THe master port gets calculated from the LSF job ID.""" - return self._master_port + def main_port(self): + """THe main port gets calculated from the LSF job ID.""" + return self._main_port def world_size(self): """The world size is read from the environment variable `JSM_NAMESPACE_SIZE`.""" @@ -126,17 +126,17 @@ def _read_hosts(): ) return hosts - def _get_master_address(self): + def _get_main_address(self): hosts = self._read_hosts() return hosts[1] @staticmethod - def _get_master_port(): - """A helper function for accessing the master port. + def _get_main_port(): + """A helper function for accessing the main port. - Uses the LSF job ID so all ranks can compute the master port. + Uses the LSF job ID so all ranks can compute the main port. 
""" - # check for user-specified master port + # check for user-specified main port port = os.environ.get("MASTER_PORT") if not port: jobid = os.environ.get("LSB_JOBID") @@ -145,7 +145,7 @@ def _get_master_port(): port = int(jobid) # all ports should be in the 10k+ range port = int(port) % 1000 + 10000 - log.debug(f"calculated LSF master port: {port}") + log.debug(f"calculated LSF main port: {port}") else: - log.debug(f"using externally specified master port: {port}") + log.debug(f"using externally specified main port: {port}") return int(port) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index fb5c3d1e92cbf..71cbaff3d3c5d 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -27,7 +27,7 @@ class SLURMEnvironment(ClusterEnvironment): def creates_children(self) -> bool: return True - def master_address(self) -> str: + def main_address(self) -> str: # figure out the root node addr slurm_nodelist = os.environ.get("SLURM_NODELIST") if slurm_nodelist: @@ -40,7 +40,7 @@ def master_address(self) -> str: log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node - def master_port(self) -> int: + def main_port(self) -> int: # ----------------------- # SLURM JOB = PORT number # ----------------------- diff --git a/pytorch_lightning/plugins/environments/torchelastic_environment.py b/pytorch_lightning/plugins/environments/torchelastic_environment.py index f6c0335ac5cc8..3c66f06bcf233 100644 --- a/pytorch_lightning/plugins/environments/torchelastic_environment.py +++ b/pytorch_lightning/plugins/environments/torchelastic_environment.py @@ -34,7 +34,7 @@ def is_using_torchelastic() -> bool: def creates_children(self) -> bool: return True - def master_address(self) -> str: + def main_address(self) -> str: if "MASTER_ADDR" not in os.environ: rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost") os.environ["MASTER_ADDR"] = "127.0.0.1" @@ -42,7 +42,7 @@ def master_address(self) -> str: master_address = os.environ.get("MASTER_ADDR") return master_address - def master_port(self) -> int: + def main_port(self) -> int: if "MASTER_PORT" not in os.environ: rank_zero_warn("MASTER_PORT environment variable is not defined. 
Set as 12910") os.environ["MASTER_PORT"] = "12910" diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 4499c1d7dfc41..cf9a3bb90a32e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -194,8 +194,8 @@ def _call_children_scripts(self): self._check_can_spawn_children() # DDP Environment variables - os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address() + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index c72cc7f31d0cc..bdb2cbe820885 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -143,7 +143,7 @@ def _is_single_process_single_device(self): return True def setup(self) -> None: - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) # pass in a state q smp = mp.get_context("spawn") self.mp_queue = smp.SimpleQueue() @@ -184,7 +184,7 @@ def spawn(self, function: Callable, *args: Any, **kwargs: Any) -> None: **kwargs: Optional named arguments that will be passed to the function in addition to the process index. These arguments must be pickleable. """ - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) mp.spawn(self._wrapped_function, args=(function, args, kwargs), **self.get_mp_spawn_kwargs()) def _wrapped_function(self, process_idx: int, function: Callable, args: Any, kwargs: Any) -> None: diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 49e2f7e8a60df..ba7c716d94dcb 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -361,12 +361,12 @@ def _init_deepspeed_distributed(self) -> None: f"MEMBER: {self.global_rank + 1}/{self.world_size}" ) deepspeed.init_distributed( - self.torch_distributed_backend, distributed_port=self.cluster_environment.master_port() + self.torch_distributed_backend, distributed_port=self.cluster_environment.main_port() ) def _set_node_environment_variables(self) -> None: - os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address() + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) os.environ["RANK"] = str(self.global_rank) os.environ["WORLD_SIZE"] = str(self.world_size) os.environ["LOCAL_RANK"] = str(self.local_rank) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index f8968a69ceed1..20e47fc374541 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -302,7 +302,7 @@ def test_step_end(self, output: STEP_OUTPUT) -> STEP_OUTPUT: def _pod_progress_bar_force_stdout(self) -> None: # Why is it required? 
The way `pytorch_xla.distributed` streams logs - # from different vms to the master worker doesn't work well with tqdm + # from different vms to the main worker doesn't work well with tqdm # Ref: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/xla_dist.py#L140 # The print statement seems to force tqdm to flush stdout. if self.tpu_global_core_rank == 0 and int(os.getenv(xenv.TPUVM_MODE, 0)) == 1: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 95c74d4a87b70..f5c5d47d157d0 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -185,7 +185,7 @@ def results(self) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: The result is cached instead of returned directly, because some plugins require transmitting the results from one multiprocessing context to another in a separate step. For example, the plugins that use the "spawn" - start-method send the result to the master process through a + start-method send the result to the main process through a `multiprocessing queue (shared memory) `_. """ return self._results diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 3db371b252490..6426802ed839e 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -162,7 +162,7 @@ def sync_ddp_if_available( def sync_ddp( result: torch.Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None ) -> torch.Tensor: - """Function to reduce the tensors from several ddp processes to one master process. + """Function to reduce the tensors from several ddp processes to one main process. 
Args: result: the value to sync and reduce (typically tensor or number) @@ -379,8 +379,8 @@ def init_ddp_connection( """ global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() - os.environ["MASTER_ADDR"] = cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = cluster_environment.main_address() + os.environ["MASTER_PORT"] = str(cluster_environment.main_port()) if torch.distributed.is_available() and not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group( diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index ca91b61fa121d..728fca6588182 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -366,7 +366,7 @@ def test_accelerator_choice_ddp_cpu_custom_cluster(_, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" class CustomCluster(LightningEnvironment): - def master_address(self): + def main_address(self): return "asdf" def creates_children(self) -> bool: diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 6db8965c12d00..def7426e5e953 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -278,7 +278,7 @@ def test_accelerator_set_when_using_tpu(tmpdir, tpu_cores): @RunIf(tpu=True) @pl_multi_process_test def test_broadcast_on_tpu(): - """Checks if an object from the master process is broadcasted to other processes correctly.""" + """Checks if an object from the main process is broadcasted to other processes correctly.""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) diff --git a/tests/plugins/environments/test_kubeflow_environment.py b/tests/plugins/environments/test_kubeflow_environment.py index 29b8bba587922..1064e66510b3b 100644 --- a/tests/plugins/environments/test_kubeflow_environment.py +++ b/tests/plugins/environments/test_kubeflow_environment.py @@ -28,10 +28,10 @@ def test_default_attributes(): with pytest.raises(KeyError): # MASTER_ADDR is required - env.master_address() + env.main_address() with pytest.raises(KeyError): # MASTER_PORT is required - env.master_port() + env.main_port() with pytest.raises(KeyError): # WORLD_SIZE is required env.world_size() @@ -54,8 +54,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = KubeflowEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address() == "1.2.3.4" + assert env.main_port() == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 0 diff --git a/tests/plugins/environments/test_lightning_environment.py b/tests/plugins/environments/test_lightning_environment.py index 012aa877ec57b..f3d1d46592b6b 100644 --- a/tests/plugins/environments/test_lightning_environment.py +++ b/tests/plugins/environments/test_lightning_environment.py @@ -24,8 +24,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = LightningEnvironment() assert not env.creates_children() - assert env.master_address() == "127.0.0.1" - 
assert isinstance(env.master_port(), int) + assert env.main_address() == "127.0.0.1" + assert isinstance(env.main_port(), int) assert env.world_size() == 1 assert env.local_rank() == 0 assert env.node_rank() == 0 @@ -35,8 +35,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(): """Test that the default cluster environment takes the attributes from the environment variables.""" env = LightningEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address() == "1.2.3.4" + assert env.main_port() == 500 assert env.world_size() == 1 assert env.global_rank() == 0 assert env.local_rank() == 2 @@ -67,12 +67,12 @@ def test_node_rank_from_group_rank(): @mock.patch.dict(os.environ, {}) def test_random_master_port(): - """Test randomly chosen master port when no master port was given by user.""" + """Test randomly chosen main port when no main port was given by user.""" env = LightningEnvironment() - port = env.master_port() + port = env.main_port() assert isinstance(port, int) # repeated calls do not generate a new port number - assert env.master_port() == port + assert env.main_port() == port @mock.patch.dict(os.environ, {"WORLD_SIZE": "1"}) diff --git a/tests/plugins/environments/test_lsf_environment.py b/tests/plugins/environments/test_lsf_environment.py index fbdb2197bdf51..de6f1e8d61779 100644 --- a/tests/plugins/environments/test_lsf_environment.py +++ b/tests/plugins/environments/test_lsf_environment.py @@ -36,10 +36,10 @@ def test_missing_lsb_job_id(): @mock.patch.dict(os.environ, {"MASTER_PORT": "4321", "LSB_JOBID": "1234", "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1"}) -def test_manual_master_port_and_address(): +def test_manual_main_port_and_address(): """Test a user can set the port manually through the MASTER_PORT env variable.""" env = LSFEnvironment() - assert env.master_port() == 4321 + assert env.main_port() == 4321 @mock.patch.dict( @@ -56,8 +56,8 @@ def test_attributes_from_environment_variables(): """Test that the LSF environment takes the attributes from the environment variables.""" env = LSFEnvironment() assert env.creates_children() - assert env.master_address() == "10.10.10.0" - assert env.master_port() == 10234 + assert env.main_address() == "10.10.10.0" + assert env.main_port() == 10234 assert env.world_size() == 4 assert env.global_rank() == 3 assert env.local_rank() == 1 diff --git a/tests/plugins/environments/test_slurm_environment.py b/tests/plugins/environments/test_slurm_environment.py index 111d317b3a5fa..d627a2f71b5e2 100644 --- a/tests/plugins/environments/test_slurm_environment.py +++ b/tests/plugins/environments/test_slurm_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = SLURMEnvironment() assert env.creates_children() - assert env.master_address() == "127.0.0.1" - assert env.master_port() == 12910 + assert env.main_address() == "127.0.0.1" + assert env.main_port() == 12910 with pytest.raises(KeyError): # world size is required to be passed as env variable env.world_size() @@ -52,8 +52,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the SLURM cluster environment takes the attributes from the environment variables.""" env = SLURMEnvironment() - assert env.master_address() == "1.1.1.1" - assert env.master_port() == 15000 + 1234 + assert env.main_address() == "1.1.1.1" + assert env.main_port() == 15000 + 1234 assert env.world_size() 
== 20 assert env.global_rank() == 1 assert env.local_rank() == 2 @@ -77,7 +77,7 @@ def test_attributes_from_environment_variables(caplog): [("alpha,beta,gamma", "alpha"), ("alpha beta gamma", "alpha"), ("1.2.3.[100-110]", "1.2.3.100")], ) def test_master_address_from_slurm_node_list(slurm_node_list, expected): - """Test extracting the master node from different formats for the SLURM_NODELIST.""" + """Test extracting the main node from different formats for the SLURM_NODELIST.""" with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}): env = SLURMEnvironment() - assert env.master_address() == expected + assert env.main_address() == expected diff --git a/tests/plugins/environments/test_torchelastic_environment.py b/tests/plugins/environments/test_torchelastic_environment.py index 19e64fba2e65f..4c6ed3903fee7 100644 --- a/tests/plugins/environments/test_torchelastic_environment.py +++ b/tests/plugins/environments/test_torchelastic_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = TorchElasticEnvironment() assert env.creates_children() - assert env.master_address() == "127.0.0.1" - assert env.master_port() == 12910 + assert env.main_address() == "127.0.0.1" + assert env.main_port() == 12910 assert env.world_size() is None with pytest.raises(KeyError): # local rank is required to be passed as env variable @@ -48,8 +48,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = TorchElasticEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address() == "1.2.3.4" + assert env.main_port() == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 2 diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7c0623323f6f1..6678892144c3a 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -819,8 +819,8 @@ def test_deepspeed_plugin_env_variables(mock_deepspeed_distributed, tmpdir, plat # assert no env variables have been set within the DeepSpeedPlugin assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK")) else: - assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.master_address()) - assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.master_port()) + assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.main_address()) + assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.main_port()) assert os.environ["RANK"] == str(trainer.training_type_plugin.global_rank) assert os.environ["WORLD_SIZE"] == str(trainer.training_type_plugin.world_size) assert os.environ["LOCAL_RANK"] == str(trainer.training_type_plugin.local_rank) From 4eae8f4ec5f91373bc6252d498bd534f0381fd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 18:04:26 +0200 Subject: [PATCH 02/18] rename properties --- .../plugins/environments/cluster_environment.py | 2 ++ pytorch_lightning/plugins/training_type/ddp.py | 4 ++-- pytorch_lightning/plugins/training_type/ddp_spawn.py | 4 ++-- pytorch_lightning/plugins/training_type/deepspeed.py | 8 +++----- pytorch_lightning/utilities/distributed.py 
| 4 ++-- .../environments/test_kubeflow_environment.py | 8 ++++---- .../environments/test_lightning_environment.py | 12 ++++++------ tests/plugins/environments/test_lsf_environment.py | 6 +++--- tests/plugins/environments/test_slurm_environment.py | 10 +++++----- .../environments/test_torchelastic_environment.py | 8 ++++---- tests/plugins/test_deepspeed_plugin.py | 4 ++-- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 43d86feb2745e..5c95d3291d40a 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -21,10 +21,12 @@ class ClusterEnvironment(ABC): def creates_children(self) -> bool: """Whether the environment creates the subprocesses or not.""" + @property @abstractmethod def main_address(self) -> str: """The main address through which all processes connect and communicate.""" + @property @abstractmethod def main_port(self) -> int: """An open and configured port in the main node through which all processes communicate.""" diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index cf9a3bb90a32e..d65204632d0d4 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -194,8 +194,8 @@ def _call_children_scripts(self): self._check_can_spawn_children() # DDP Environment variables - os.environ["MASTER_ADDR"] = self.cluster_environment.main_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index bdb2cbe820885..fb9c498a4b84f 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -143,7 +143,7 @@ def _is_single_process_single_device(self): return True def setup(self) -> None: - os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) # pass in a state q smp = mp.get_context("spawn") self.mp_queue = smp.SimpleQueue() @@ -184,7 +184,7 @@ def spawn(self, function: Callable, *args: Any, **kwargs: Any) -> None: **kwargs: Optional named arguments that will be passed to the function in addition to the process index. These arguments must be pickleable. 
""" - os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) mp.spawn(self._wrapped_function, args=(function, args, kwargs), **self.get_mp_spawn_kwargs()) def _wrapped_function(self, process_idx: int, function: Callable, args: Any, kwargs: Any) -> None: diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index ba7c716d94dcb..8c7a415a3f023 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -360,13 +360,11 @@ def _init_deepspeed_distributed(self) -> None: f"GLOBAL_RANK: {self.global_rank}, " f"MEMBER: {self.global_rank + 1}/{self.world_size}" ) - deepspeed.init_distributed( - self.torch_distributed_backend, distributed_port=self.cluster_environment.main_port() - ) + deepspeed.init_distributed(self.torch_distributed_backend, distributed_port=self.cluster_environment.main_port) def _set_node_environment_variables(self) -> None: - os.environ["MASTER_ADDR"] = self.cluster_environment.main_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) os.environ["RANK"] = str(self.global_rank) os.environ["WORLD_SIZE"] = str(self.world_size) os.environ["LOCAL_RANK"] = str(self.local_rank) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 6426802ed839e..45be22c36facc 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -379,8 +379,8 @@ def init_ddp_connection( """ global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() - os.environ["MASTER_ADDR"] = cluster_environment.main_address() - os.environ["MASTER_PORT"] = str(cluster_environment.main_port()) + os.environ["MASTER_ADDR"] = cluster_environment.main_address + os.environ["MASTER_PORT"] = str(cluster_environment.main_port) if torch.distributed.is_available() and not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group( diff --git a/tests/plugins/environments/test_kubeflow_environment.py b/tests/plugins/environments/test_kubeflow_environment.py index 1064e66510b3b..148fa157a0fee 100644 --- a/tests/plugins/environments/test_kubeflow_environment.py +++ b/tests/plugins/environments/test_kubeflow_environment.py @@ -28,10 +28,10 @@ def test_default_attributes(): with pytest.raises(KeyError): # MASTER_ADDR is required - env.main_address() + env.main_address with pytest.raises(KeyError): # MASTER_PORT is required - env.main_port() + env.main_port with pytest.raises(KeyError): # WORLD_SIZE is required env.world_size() @@ -54,8 +54,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = KubeflowEnvironment() - assert env.main_address() == "1.2.3.4" - assert env.main_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 0 diff --git 
a/tests/plugins/environments/test_lightning_environment.py b/tests/plugins/environments/test_lightning_environment.py index f3d1d46592b6b..41fc12d9f5ae4 100644 --- a/tests/plugins/environments/test_lightning_environment.py +++ b/tests/plugins/environments/test_lightning_environment.py @@ -24,8 +24,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = LightningEnvironment() assert not env.creates_children() - assert env.main_address() == "127.0.0.1" - assert isinstance(env.main_port(), int) + assert env.main_address == "127.0.0.1" + assert isinstance(env.main_port, int) assert env.world_size() == 1 assert env.local_rank() == 0 assert env.node_rank() == 0 @@ -35,8 +35,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(): """Test that the default cluster environment takes the attributes from the environment variables.""" env = LightningEnvironment() - assert env.main_address() == "1.2.3.4" - assert env.main_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 1 assert env.global_rank() == 0 assert env.local_rank() == 2 @@ -69,10 +69,10 @@ def test_node_rank_from_group_rank(): def test_random_master_port(): """Test randomly chosen main port when no main port was given by user.""" env = LightningEnvironment() - port = env.main_port() + port = env.main_port assert isinstance(port, int) # repeated calls do not generate a new port number - assert env.main_port() == port + assert env.main_port == port @mock.patch.dict(os.environ, {"WORLD_SIZE": "1"}) diff --git a/tests/plugins/environments/test_lsf_environment.py b/tests/plugins/environments/test_lsf_environment.py index de6f1e8d61779..997c3a4236053 100644 --- a/tests/plugins/environments/test_lsf_environment.py +++ b/tests/plugins/environments/test_lsf_environment.py @@ -39,7 +39,7 @@ def test_missing_lsb_job_id(): def test_manual_main_port_and_address(): """Test a user can set the port manually through the MASTER_PORT env variable.""" env = LSFEnvironment() - assert env.main_port() == 4321 + assert env.main_port == 4321 @mock.patch.dict( @@ -56,8 +56,8 @@ def test_attributes_from_environment_variables(): """Test that the LSF environment takes the attributes from the environment variables.""" env = LSFEnvironment() assert env.creates_children() - assert env.main_address() == "10.10.10.0" - assert env.main_port() == 10234 + assert env.main_address == "10.10.10.0" + assert env.main_port == 10234 assert env.world_size() == 4 assert env.global_rank() == 3 assert env.local_rank() == 1 diff --git a/tests/plugins/environments/test_slurm_environment.py b/tests/plugins/environments/test_slurm_environment.py index d627a2f71b5e2..581f9dd3cb440 100644 --- a/tests/plugins/environments/test_slurm_environment.py +++ b/tests/plugins/environments/test_slurm_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = SLURMEnvironment() assert env.creates_children() - assert env.main_address() == "127.0.0.1" - assert env.main_port() == 12910 + assert env.main_address == "127.0.0.1" + assert env.main_port == 12910 with pytest.raises(KeyError): # world size is required to be passed as env variable env.world_size() @@ -52,8 +52,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the SLURM cluster environment takes the attributes from the environment variables.""" env = SLURMEnvironment() 
- assert env.main_address() == "1.1.1.1" - assert env.main_port() == 15000 + 1234 + assert env.main_address == "1.1.1.1" + assert env.main_port == 15000 + 1234 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 2 @@ -80,4 +80,4 @@ def test_master_address_from_slurm_node_list(slurm_node_list, expected): """Test extracting the main node from different formats for the SLURM_NODELIST.""" with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}): env = SLURMEnvironment() - assert env.main_address() == expected + assert env.main_address == expected diff --git a/tests/plugins/environments/test_torchelastic_environment.py b/tests/plugins/environments/test_torchelastic_environment.py index 4c6ed3903fee7..f16d4797b68b2 100644 --- a/tests/plugins/environments/test_torchelastic_environment.py +++ b/tests/plugins/environments/test_torchelastic_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = TorchElasticEnvironment() assert env.creates_children() - assert env.main_address() == "127.0.0.1" - assert env.main_port() == 12910 + assert env.main_address == "127.0.0.1" + assert env.main_port == 12910 assert env.world_size() is None with pytest.raises(KeyError): # local rank is required to be passed as env variable @@ -48,8 +48,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = TorchElasticEnvironment() - assert env.main_address() == "1.2.3.4" - assert env.main_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 2 diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 6678892144c3a..97ac9723a9344 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -819,8 +819,8 @@ def test_deepspeed_plugin_env_variables(mock_deepspeed_distributed, tmpdir, plat # assert no env variables have been set within the DeepSpeedPlugin assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK")) else: - assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.main_address()) - assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.main_port()) + assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.main_address) + assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.main_port) assert os.environ["RANK"] == str(trainer.training_type_plugin.global_rank) assert os.environ["WORLD_SIZE"] == str(trainer.training_type_plugin.world_size) assert os.environ["LOCAL_RANK"] == str(trainer.training_type_plugin.local_rank) From 66d77ec225c75ec2460b68b8375507e562e22c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 18:06:17 +0200 Subject: [PATCH 03/18] add property decorators --- .../plugins/environments/cluster_environment.py | 8 ++++---- .../plugins/environments/kubeflow_environment.py | 2 ++ .../plugins/environments/lightning_environment.py | 2 ++ pytorch_lightning/plugins/environments/lsf_environment.py | 2 ++ .../plugins/environments/slurm_environment.py | 2 ++ .../plugins/environments/torchelastic_environment.py | 2 ++ 
tests/accelerators/test_accelerator_connector.py | 1 + 7 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 5c95d3291d40a..720334f5c1a71 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -17,10 +17,6 @@ class ClusterEnvironment(ABC): """Specification of a cluster environment.""" - @abstractmethod - def creates_children(self) -> bool: - """Whether the environment creates the subprocesses or not.""" - @property @abstractmethod def main_address(self) -> str: @@ -31,6 +27,10 @@ def main_address(self) -> str: def main_port(self) -> int: """An open and configured port in the main node through which all processes communicate.""" + @abstractmethod + def creates_children(self) -> bool: + """Whether the environment creates the subprocesses or not.""" + @abstractmethod def world_size(self) -> int: """The number of processes across all devices and nodes.""" diff --git a/pytorch_lightning/plugins/environments/kubeflow_environment.py b/pytorch_lightning/plugins/environments/kubeflow_environment.py index 3f8f2c9b7af65..305b1d935787b 100644 --- a/pytorch_lightning/plugins/environments/kubeflow_environment.py +++ b/pytorch_lightning/plugins/environments/kubeflow_environment.py @@ -38,9 +38,11 @@ def is_using_kubeflow() -> bool: def creates_children(self) -> bool: return True + @property def main_address(self) -> str: return os.environ["MASTER_ADDR"] + @property def main_port(self) -> int: return int(os.environ["MASTER_PORT"]) diff --git a/pytorch_lightning/plugins/environments/lightning_environment.py b/pytorch_lightning/plugins/environments/lightning_environment.py index 071b89cf46433..082fde9f61700 100644 --- a/pytorch_lightning/plugins/environments/lightning_environment.py +++ b/pytorch_lightning/plugins/environments/lightning_environment.py @@ -48,9 +48,11 @@ def creates_children(self) -> bool: """ return "LOCAL_RANK" in os.environ + @property def main_address(self) -> str: return os.environ.get("MASTER_ADDR", "127.0.0.1") + @property def main_port(self) -> int: if self._main_port is None: self._main_port = os.environ.get("MASTER_PORT", find_free_network_port()) diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index 368c6f067e150..d42842ad1cc7c 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -55,10 +55,12 @@ def is_using_lsf() -> bool: def creates_children(self) -> bool: return True + @property def main_address(self): """The main address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" return self._main_address + @property def main_port(self): """THe main port gets calculated from the LSF job ID.""" return self._main_port diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 71cbaff3d3c5d..619d4a317709c 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -27,6 +27,7 @@ class SLURMEnvironment(ClusterEnvironment): def creates_children(self) -> bool: return True + @property def main_address(self) -> str: # figure out the root node addr slurm_nodelist = os.environ.get("SLURM_NODELIST") @@ -40,6 
+41,7 @@ def main_address(self) -> str: log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node + @property def main_port(self) -> int: # ----------------------- # SLURM JOB = PORT number diff --git a/pytorch_lightning/plugins/environments/torchelastic_environment.py b/pytorch_lightning/plugins/environments/torchelastic_environment.py index 3c66f06bcf233..985791b1db5eb 100644 --- a/pytorch_lightning/plugins/environments/torchelastic_environment.py +++ b/pytorch_lightning/plugins/environments/torchelastic_environment.py @@ -34,6 +34,7 @@ def is_using_torchelastic() -> bool: def creates_children(self) -> bool: return True + @property def main_address(self) -> str: if "MASTER_ADDR" not in os.environ: rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost") @@ -42,6 +43,7 @@ def main_address(self) -> str: master_address = os.environ.get("MASTER_ADDR") return master_address + @property def main_port(self) -> int: if "MASTER_PORT" not in os.environ: rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910") diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 728fca6588182..937b61fa47e72 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -366,6 +366,7 @@ def test_accelerator_choice_ddp_cpu_custom_cluster(_, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" class CustomCluster(LightningEnvironment): + @property def main_address(self): return "asdf" From 7e86ce74faa5b24b6c77baa8d0727569152eb172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 18:10:59 +0200 Subject: [PATCH 04/18] occurrences in docs --- docs/source/clouds/cluster.rst | 4 ++-- docs/source/common/trainer.rst | 4 ++-- docs/source/guides/speed.rst | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/clouds/cluster.rst b/docs/source/clouds/cluster.rst index 40eaaab1fc11a..afb3b527e2bcc 100644 --- a/docs/source/clouds/cluster.rst +++ b/docs/source/clouds/cluster.rst @@ -82,7 +82,7 @@ Once the script is setup like described in :ref:`training_script_setup`, you can Like a custom cluster, you have to ensure that there is network connectivity between the nodes with firewall rules that allow traffic flow on a specified *MASTER_PORT*. -Finally, you'll need to decide which node you'd like to be the master node (*MASTER_ADDR*), and the ranks of each node (*NODE_RANK*). +Finally, you'll need to decide which node you'd like to be the main node (*MASTER_ADDR*), and the ranks of each node (*NODE_RANK*). For example: @@ -248,7 +248,7 @@ See also the multi-node examples # NCCL is how the nodes talk to each other cluster.add_command("export NCCL_DEBUG=INFO") - # setting a master port here is a good idea. + # setting a main port here is a good idea. cluster.add_command("export MASTER_PORT=%r" % PORT) # ************** DON'T FORGET THIS *************** diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 2934e980501d6..554f6b19472ae 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1078,10 +1078,10 @@ To define your own behavior, subclass the relevant class and pass it in. 
Here's class MyCluster(ClusterEnvironment): def main_address(self): - return your_master_address + return your_main_address def main_port(self): - return your_master_port + return your_main_port def world_size(self): return the_world_size diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index 0f5a34e69930a..f19ffd1c09446 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -71,7 +71,7 @@ Prefer DDP over DP 1. Copy model to device. 2. Copy data to device. -3. Copy outputs of each device back to master. +3. Copy outputs of each device back to main device. Whereas :class:`~pytorch_lightning.plugins.training_type.DDPPlugin` only performs 1 transfer to sync gradients, making DDP MUCH faster than DP. From 9e544d84f8d9e1606bffc44d8a609b0eb69f672e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 18:14:16 +0200 Subject: [PATCH 05/18] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 655484292ee59..63e8e78de602f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -334,6 +334,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) +- Renamed `ClusterEnvironment.master_address` to `ClusterEnvironment.main_address`, `ClusterEnvironment.master_port` to `ClusterEnvironment.main_port`, and made both of them properties instead of functions ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103)) + ### Deprecated From 546b45d8da0a15a29f696345b57f7e3c872b7402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 24 Oct 2021 18:15:29 +0200 Subject: [PATCH 06/18] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63e8e78de602f..4980b4f714d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -334,7 +334,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) -- Renamed `ClusterEnvironment.master_address` to `ClusterEnvironment.main_address`, `ClusterEnvironment.master_port` to `ClusterEnvironment.main_port`, and made both of them properties instead of functions ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103)) +- Renamed `ClusterEnvironment.master_address` to `ClusterEnvironment.main_address`, `ClusterEnvironment.master_port` to `ClusterEnvironment.main_port`, and made both of them properties instead of methods ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103)) ### Deprecated From b463d86b55ffb9b1495d36dd15f930928f8f4e40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 24 Oct 2021 16:17:04 +0000 Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4980b4f714d43..158f333682ab3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -334,7 +334,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) -- Renamed `ClusterEnvironment.master_address` to `ClusterEnvironment.main_address`, `ClusterEnvironment.master_port` to `ClusterEnvironment.main_port`, and made both of them properties instead of methods ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103)) +- Renamed `ClusterEnvironment.master_address` to `ClusterEnvironment.main_address`, `ClusterEnvironment.master_port` to `ClusterEnvironment.main_port`, and made both of them properties instead of methods ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103)) ### Deprecated From ecc8f942bc9381e9bd93f485c26dcb4419c0ec0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Nov 2021 18:16:22 +0100 Subject: [PATCH 08/18] add lost method --- .../plugins/environments/cluster_environment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 6505bf0cc8efd..69dd56b1889a7 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -37,6 +37,11 @@ def creates_children(self) -> bool: ) return self.creates_processes_externally + @property + @abstractmethod + def main_address(self) -> str: + """The main address through which all processes connect and communicate.""" + @property @abstractmethod def main_port(self) -> int: From 15445a09c0b3710ad8318ed873cf6a09cc6ada67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Nov 2021 18:16:45 +0100 Subject: [PATCH 09/18] create deprecation --- .../environments/cluster_environment.py | 18 ++++++++ tests/deprecated_api/test_remove_1-7.py | 46 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 
69dd56b1889a7..395f9305ef58e 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from abc import ABC, abstractmethod +from typing import Type, Any from pytorch_lightning.utilities import rank_zero_deprecation @@ -19,6 +20,10 @@ class ClusterEnvironment(ABC): """Specification of a cluster environment.""" + def __new__(cls, *args: Any, **kwargs: Any) -> "ClusterEnvironment": + _check_for_deprecated_methods(cls) + return super(ClusterEnvironment, cls).__new__(cls, *args, **kwargs) + @property @abstractmethod def creates_processes_externally(self) -> bool: @@ -74,3 +79,16 @@ def node_rank(self) -> int: def teardown(self) -> None: """Clean up any state set after execution finishes.""" pass + + +def _check_for_deprecated_methods(cls: Type[ClusterEnvironment]) -> None: + if hasattr(cls, "master_address") and callable(cls.master_address): + rank_zero_deprecation( + f"`{cls.__name__}.master_address` has been deprecated in v1.6 and will be removed in 1.7." + " Implement the property `main_address` instead (do not forget to add the `@property` decorator." + ) + if hasattr(cls, "master_port") and callable(cls.master_port): + rank_zero_deprecation( + f"`{cls.__name__}.master_port` has been deprecated in v1.6 and will be removed in 1.7." + " Implement the property `main_port` instead (do not forget to add the `@property` decorator." + ) diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 16c511b6effd9..ec44d9842ce2a 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -23,6 +23,12 @@ from pytorch_lightning.callbacks.progress import ProgressBar from pytorch_lightning.callbacks.xla_stats_monitor import XLAStatsMonitor from pytorch_lightning.loggers import LoggerCollection, TestTubeLogger +from pytorch_lightning.plugins.environments import ( + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) from tests.callbacks.test_callbacks import OldStatefulCallback from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel @@ -455,3 +461,43 @@ def test_v1_7_0_deprecate_lr_sch_names(tmpdir): with pytest.deprecated_call(match="`LearningRateMonitor.lr_sch_names` has been deprecated in v1.5"): assert lr_monitor.lr_sch_names == ["lr-SGD"] + + +@pytest.mark.parametrize( + "cls", + [ + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, + ], +) +def test_v1_7_0_cluster_environment_master_address(cls): + class MyClusterEnvironment(cls): + def master_address(self): + pass + + with pytest.deprecated_call( + match="MyClusterEnvironment.master_address` has been deprecated in v1.6 and will be removed in 1.7" + ): + MyClusterEnvironment() + + +@pytest.mark.parametrize( + "cls", + [ + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, + ], +) +def test_v1_7_0_cluster_environment_master_port(cls): + class MyClusterEnvironment(cls): + def master_port(self): + pass + + with pytest.deprecated_call( + match="MyClusterEnvironment.master_port` has been deprecated in v1.6 and will be removed in 1.7" + ): + MyClusterEnvironment() From 77aca7c99758ce0471440f47e7657f97c93a195d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Nov 2021 18:20:04 +0100 
From 77aca7c99758ce0471440f47e7657f97c93a195d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 3 Nov 2021 18:20:04 +0100
Subject: [PATCH 10/18] add changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eabd9329f0c73..8b6bd5b25f737 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,7 +41,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Deprecated
 
--
+- Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/issues/10103))
 
 -

From 8d287e8e3ca818d087065df470ea4fef337f4615 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 3 Nov 2021 17:21:41 +0000
Subject: [PATCH 11/18] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/plugins/environments/cluster_environment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py
index 395f9305ef58e..2ea50159904d5 100644
--- a/pytorch_lightning/plugins/environments/cluster_environment.py
+++ b/pytorch_lightning/plugins/environments/cluster_environment.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC, abstractmethod
-from typing import Type, Any
+from typing import Any, Type
 
 from pytorch_lightning.utilities import rank_zero_deprecation
 
@@ -22,7 +22,7 @@ class ClusterEnvironment(ABC):
 
     def __new__(cls, *args: Any, **kwargs: Any) -> "ClusterEnvironment":
         _check_for_deprecated_methods(cls)
-        return super(ClusterEnvironment, cls).__new__(cls, *args, **kwargs)
+        return super().__new__(cls, *args, **kwargs)
 
     @property
     @abstractmethod
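For user code, the changelog entry added in PATCH 10 boils down to a mechanical migration, mirroring what the new tests in PATCH 09 exercise: keep the old method and construction emits a deprecation warning; switch to the property and it does not. A hedged before/after sketch for a custom subclass (the `MY_PORT` variable is a placeholder):

    import os

    from pytorch_lightning.plugins.environments import LightningEnvironment


    class MyEnvOld(LightningEnvironment):  # deprecated in v1.6, removed in v1.7
        def master_port(self):
            return int(os.environ["MY_PORT"])


    class MyEnvNew(LightningEnvironment):  # the replacement: a property
        @property
        def main_port(self) -> int:
            return int(os.environ["MY_PORT"])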
From e99198e4ec64cbb1ce045b8826ddedf9261088e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 3 Nov 2021 18:27:42 +0100
Subject: [PATCH 12/18] fix typo (but it was already there!!!)

---
 pytorch_lightning/plugins/environments/lsf_environment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py
index 73cb3f42c0ea7..82c4297cdfdfc 100644
--- a/pytorch_lightning/plugins/environments/lsf_environment.py
+++ b/pytorch_lightning/plugins/environments/lsf_environment.py
@@ -63,7 +63,7 @@ def main_address(self):
 
     @property
     def main_port(self):
-        """THe main port gets calculated from the LSF job ID."""
+        """The main port gets calculated from the LSF job ID."""
         return self._main_port
 
     def world_size(self):
From 8cc37637abe2191d5ee89f24e801d231262ebd28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 3 Nov 2021 18:28:08 +0100
Subject: [PATCH 13/18] Apply suggestions from code review

Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>
---
 pytorch_lightning/plugins/environments/cluster_environment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py
index 2ea50159904d5..b88d2be8ac8ae 100644
--- a/pytorch_lightning/plugins/environments/cluster_environment.py
+++ b/pytorch_lightning/plugins/environments/cluster_environment.py
@@ -85,10 +85,10 @@ def _check_for_deprecated_methods(cls: Type[ClusterEnvironment]) -> None:
     if hasattr(cls, "master_address") and callable(cls.master_address):
         rank_zero_deprecation(
             f"`{cls.__name__}.master_address` has been deprecated in v1.6 and will be removed in 1.7."
-            " Implement the property `main_address` instead (do not forget to add the `@property` decorator."
+            " Implement the property `main_address` instead (do not forget to add the `@property` decorator)."
         )
     if hasattr(cls, "master_port") and callable(cls.master_port):
         rank_zero_deprecation(
             f"`{cls.__name__}.master_port` has been deprecated in v1.6 and will be removed in 1.7."
-            " Implement the property `main_port` instead (do not forget to add the `@property` decorator."
+            " Implement the property `main_port` instead (do not forget to add the `@property` decorator)."
         )

From 372b7c15c205ac11b55757bf3d1f8fcfd2a60aa3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 3 Nov 2021 18:29:53 +0100
Subject: [PATCH 14/18] add todo

---
 pytorch_lightning/plugins/environments/cluster_environment.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py
index b88d2be8ac8ae..36832ea96c007 100644
--- a/pytorch_lightning/plugins/environments/cluster_environment.py
+++ b/pytorch_lightning/plugins/environments/cluster_environment.py
@@ -21,6 +21,7 @@ class ClusterEnvironment(ABC):
     """Specification of a cluster environment."""
 
     def __new__(cls, *args: Any, **kwargs: Any) -> "ClusterEnvironment":
+        # TODO: remove in 1.7
         _check_for_deprecated_methods(cls)
         return super().__new__(cls, *args, **kwargs)

From be51fd30aab52ce90dffbe32196d60a09e8e78fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 4 Nov 2021 01:42:14 +0100
Subject: [PATCH 15/18] update more occurrences

---
 .../plugins/environments/torchelastic_environment.py | 4 ++--
 pytorch_lightning/plugins/training_type/ddp.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/plugins/environments/torchelastic_environment.py b/pytorch_lightning/plugins/environments/torchelastic_environment.py
index 249bfc7fea168..597a01ed206a5 100644
--- a/pytorch_lightning/plugins/environments/torchelastic_environment.py
+++ b/pytorch_lightning/plugins/environments/torchelastic_environment.py
@@ -41,8 +41,8 @@ def main_address(self) -> str:
             rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")
             os.environ["MASTER_ADDR"] = "127.0.0.1"
         log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
-        master_address = os.environ.get("MASTER_ADDR")
-        return master_address
+        main_address = os.environ.get("MASTER_ADDR")
+        return main_address
 
     @property
     def main_port(self) -> int:
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 99e6c87576a7f..7441bf6b3ef8e 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -85,7 +85,7 @@ class DDPPlugin(ParallelPlugin):
     """Plugin for multi-process single-device training on one or multiple nodes.
 
-    The master process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of
+    The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of
     devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes.
     """
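The DDPPlugin docstring reworded in PATCH 15 describes the launch topology: the main process (local rank 0) re-executes the training script once per remaining device, much like `torch.distributed.launch`. A rough illustration of that mechanism on a single node — a sketch of the idea, not the actual DDPPlugin code:

    import os
    import subprocess
    import sys


    def spawn_children(num_devices: int) -> None:
        # Local rank 0 is the current process; launch the remaining N-1 ranks.
        for local_rank in range(1, num_devices):
            env = os.environ.copy()
            env["LOCAL_RANK"] = str(local_rank)
            # Re-run the exact same command line; each child sees LOCAL_RANK
            # in its environment and therefore skips spawning on its own.
            subprocess.Popen([sys.executable] + sys.argv, env=env)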
""" From 4c34f8506172cb8d45495e106d7586dfab425ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 4 Nov 2021 02:36:45 +0100 Subject: [PATCH 16/18] add types --- pytorch_lightning/plugins/environments/lsf_environment.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index 82c4297cdfdfc..3b67edd8b4091 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -57,12 +57,12 @@ def creates_processes_externally(self) -> bool: return True @property - def main_address(self): + def main_address(self) -> str: """The main address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" return self._main_address @property - def main_port(self): + def main_port(self) -> int: """The main port gets calculated from the LSF job ID.""" return self._main_port @@ -129,12 +129,12 @@ def _read_hosts(): ) return hosts - def _get_main_address(self): + def _get_main_address(self) -> str: hosts = self._read_hosts() return hosts[1] @staticmethod - def _get_main_port(): + def _get_main_port() -> int: """A helper function for accessing the main port. Uses the LSF job ID so all ranks can compute the main port. From b1b4e78fc15878886292eb57aac6de502be8919e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 5 Nov 2021 11:48:39 +0100 Subject: [PATCH 17/18] add missing import --- pytorch_lightning/plugins/environments/cluster_environment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 3ba0a8cde8b98..2dad5c2a924fc 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -14,6 +14,8 @@ from abc import ABC, abstractmethod from typing import Any, Type +from pytorch_lightning.utilities import rank_zero_deprecation + class ClusterEnvironment(ABC): """Specification of a cluster environment.""" From 4c8f9ad06b19340b20c61092d94f5cef49ef4e1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 8 Nov 2021 12:55:55 +0100 Subject: [PATCH 18/18] fix merge conflict error in base class --- pytorch_lightning/plugins/environments/cluster_environment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 2dad5c2a924fc..1cf209c897cf4 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -30,6 +30,7 @@ def __new__(cls, *args: Any, **kwargs: Any) -> "ClusterEnvironment": def creates_processes_externally(self) -> bool: """Whether the environment creates the subprocesses or not.""" + @property @abstractmethod def main_address(self) -> str: """The main address through which all processes connect and communicate."""