diff --git a/CHANGELOG.md b/CHANGELOG.md index 09cced9ed9bd7..82623bbf1de00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,7 +41,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Deprecated -- +- Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/issues/10103)) - diff --git a/docs/source/clouds/cluster.rst b/docs/source/clouds/cluster.rst index 6f1c70b48b448..2d6b4e19d6e98 100644 --- a/docs/source/clouds/cluster.rst +++ b/docs/source/clouds/cluster.rst @@ -82,7 +82,7 @@ Once the script is setup like described in :ref:`training_script_setup`, you can Like a custom cluster, you have to ensure that there is network connectivity between the nodes with firewall rules that allow traffic flow on a specified *MASTER_PORT*. -Finally, you'll need to decide which node you'd like to be the master node (*MASTER_ADDR*), and the ranks of each node (*NODE_RANK*). +Finally, you'll need to decide which node you'd like to be the main node (*MASTER_ADDR*), and the ranks of each node (*NODE_RANK*). For example: @@ -248,7 +248,7 @@ See also the multi-node examples # NCCL is how the nodes talk to each other cluster.add_command("export NCCL_DEBUG=INFO") - # setting a master port here is a good idea. + # setting a main port here is a good idea. cluster.add_command("export MASTER_PORT=%r" % PORT) # ************** DON'T FORGET THIS *************** @@ -307,10 +307,10 @@ and node rank (node id). Here is an example of a custom def node_rank(self) -> int: return int(os.environ["NODE_RANK"]) - def master_address(self) -> str: + def main_address(self) -> str: return os.environ["MASTER_ADDRESS"] - def master_port(self) -> int: + def main_port(self) -> int: return int(os.environ["MASTER_PORT"]) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 0738a2b077435..7e336c758f621 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1116,11 +1116,11 @@ To define your own behavior, subclass the relevant class and pass it in. Here's class MyCluster(ClusterEnvironment): - def master_address(self): - return your_master_address + def main_address(self): + return your_main_address - def master_port(self): - return your_master_port + def main_port(self): + return your_main_port def world_size(self): return the_world_size diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index 0b8edb43b7ec5..576589c670635 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -71,7 +71,7 @@ Prefer DDP over DP 1. Copy model to device. 2. Copy data to device. -3. Copy outputs of each device back to master. +3. Copy outputs of each device back to main device. Whereas :class:`~pytorch_lightning.plugins.training_type.DDPPlugin` only performs 1 transfer to sync gradients, making DDP MUCH faster than DP. diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 0c3f2bf1901ba..6fb56ea802b1d 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -479,7 +479,7 @@ def results(self) -> Any: This property is deprecated in v1.5 and will be removed in v1.6. Please call `training_type_plugin.results` directly. - In distributed training, we make sure to transfer the results to the appropriate master process. + In distributed training, we make sure to transfer the results to the appropriate main process. """ rank_zero_deprecation( "`Accelerator.results` is deprecated in v1.5 and will be removed in v1.6. " diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index dd1a43c45756a..1cf209c897cf4 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -12,23 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. from abc import ABC, abstractmethod +from typing import Any, Type + +from pytorch_lightning.utilities import rank_zero_deprecation class ClusterEnvironment(ABC): """Specification of a cluster environment.""" + def __new__(cls, *args: Any, **kwargs: Any) -> "ClusterEnvironment": + # TODO: remove in 1.7 + _check_for_deprecated_methods(cls) + return super().__new__(cls, *args, **kwargs) + @property @abstractmethod def creates_processes_externally(self) -> bool: """Whether the environment creates the subprocesses or not.""" + @property @abstractmethod - def master_address(self) -> str: - """The master address through which all processes connect and communicate.""" + def main_address(self) -> str: + """The main address through which all processes connect and communicate.""" + @property @abstractmethod - def master_port(self) -> int: - """An open and configured port in the master node through which all processes communicate.""" + def main_port(self) -> int: + """An open and configured port in the main node through which all processes communicate.""" @abstractmethod def world_size(self) -> int: @@ -57,3 +67,16 @@ def node_rank(self) -> int: def teardown(self) -> None: """Clean up any state set after execution finishes.""" pass + + +def _check_for_deprecated_methods(cls: Type[ClusterEnvironment]) -> None: + if hasattr(cls, "master_address") and callable(cls.master_address): + rank_zero_deprecation( + f"`{cls.__name__}.master_address` has been deprecated in v1.6 and will be removed in 1.7." + " Implement the property `main_address` instead (do not forget to add the `@property` decorator)." + ) + if hasattr(cls, "master_port") and callable(cls.master_port): + rank_zero_deprecation( + f"`{cls.__name__}.master_port` has been deprecated in v1.6 and will be removed in 1.7." + " Implement the property `main_port` instead (do not forget to add the `@property` decorator)." + ) diff --git a/pytorch_lightning/plugins/environments/kubeflow_environment.py b/pytorch_lightning/plugins/environments/kubeflow_environment.py index c92c10dfad8bd..8f959dc627d6e 100644 --- a/pytorch_lightning/plugins/environments/kubeflow_environment.py +++ b/pytorch_lightning/plugins/environments/kubeflow_environment.py @@ -39,10 +39,12 @@ def is_using_kubeflow() -> bool: def creates_processes_externally(self) -> bool: return True - def master_address(self) -> str: + @property + def main_address(self) -> str: return os.environ["MASTER_ADDR"] - def master_port(self) -> int: + @property + def main_port(self) -> int: return int(os.environ["MASTER_PORT"]) def world_size(self) -> int: diff --git a/pytorch_lightning/plugins/environments/lightning_environment.py b/pytorch_lightning/plugins/environments/lightning_environment.py index 6919b211cb0ba..af3db7c5e9e88 100644 --- a/pytorch_lightning/plugins/environments/lightning_environment.py +++ b/pytorch_lightning/plugins/environments/lightning_environment.py @@ -29,14 +29,14 @@ class LightningEnvironment(ClusterEnvironment): 2. The user launches all processes manually or with utilities like :code:`torch.distributed.launch`. The appropriate environment variables need to be set, and at minimum :code:`LOCAL_RANK`. - If the master address and port are not provided, the default environment will choose them + If the main address and port are not provided, the default environment will choose them automatically. It is recommended to use this default environment for single-node distributed training as it provides a convenient way to launch the training script. """ def __init__(self): super().__init__() - self._master_port = None + self._main_port = None self._global_rank: int = 0 self._world_size: int = 1 @@ -49,13 +49,15 @@ def creates_processes_externally(self) -> bool: """ return "LOCAL_RANK" in os.environ - def master_address(self) -> str: + @property + def main_address(self) -> str: return os.environ.get("MASTER_ADDR", "127.0.0.1") - def master_port(self) -> int: - if self._master_port is None: - self._master_port = os.environ.get("MASTER_PORT", find_free_network_port()) - return int(self._master_port) + @property + def main_port(self) -> int: + if self._main_port is None: + self._main_port = os.environ.get("MASTER_PORT", find_free_network_port()) + return int(self._main_port) def world_size(self) -> int: return self._world_size @@ -85,7 +87,7 @@ def teardown(self) -> None: def find_free_network_port() -> int: """Finds a free port on localhost. - It is useful in single-node training when we don't want to connect to a real master node but have to set the + It is useful in single-node training when we don't want to connect to a real main node but have to set the `MASTER_PORT` environment variable. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index 06563c7f017bb..3b67edd8b4091 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -41,10 +41,10 @@ class LSFEnvironment(ClusterEnvironment): """ def __init__(self): - self._master_address = self._get_master_address() - self._master_port = self._get_master_port() - log.debug(f"MASTER_ADDR: {self._master_address}") - log.debug(f"MASTER_PORT: {self._master_port}") + self._main_address = self._get_main_address() + self._main_port = self._get_main_port() + log.debug(f"MASTER_ADDR: {self._main_address}") + log.debug(f"MASTER_PORT: {self._main_port}") @staticmethod def is_using_lsf() -> bool: @@ -56,13 +56,15 @@ def is_using_lsf() -> bool: def creates_processes_externally(self) -> bool: return True - def master_address(self): - """The master address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" - return self._master_address + @property + def main_address(self) -> str: + """The main address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" + return self._main_address - def master_port(self): - """THe master port gets calculated from the LSF job ID.""" - return self._master_port + @property + def main_port(self) -> int: + """The main port gets calculated from the LSF job ID.""" + return self._main_port def world_size(self): """The world size is read from the environment variable `JSM_NAMESPACE_SIZE`.""" @@ -127,17 +129,17 @@ def _read_hosts(): ) return hosts - def _get_master_address(self): + def _get_main_address(self) -> str: hosts = self._read_hosts() return hosts[1] @staticmethod - def _get_master_port(): - """A helper function for accessing the master port. + def _get_main_port() -> int: + """A helper function for accessing the main port. - Uses the LSF job ID so all ranks can compute the master port. + Uses the LSF job ID so all ranks can compute the main port. """ - # check for user-specified master port + # check for user-specified main port port = os.environ.get("MASTER_PORT") if not port: jobid = os.environ.get("LSB_JOBID") @@ -146,7 +148,7 @@ def _get_master_port(): port = int(jobid) # all ports should be in the 10k+ range port = int(port) % 1000 + 10000 - log.debug(f"calculated LSF master port: {port}") + log.debug(f"calculated LSF main port: {port}") else: - log.debug(f"using externally specified master port: {port}") + log.debug(f"using externally specified main port: {port}") return int(port) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index d599c513650c2..9f3f52fc1c381 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -28,7 +28,8 @@ class SLURMEnvironment(ClusterEnvironment): def creates_processes_externally(self) -> bool: return True - def master_address(self) -> str: + @property + def main_address(self) -> str: # figure out the root node addr slurm_nodelist = os.environ.get("SLURM_NODELIST") if slurm_nodelist: @@ -41,7 +42,8 @@ def master_address(self) -> str: log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node - def master_port(self) -> int: + @property + def main_port(self) -> int: # ----------------------- # SLURM JOB = PORT number # ----------------------- diff --git a/pytorch_lightning/plugins/environments/torchelastic_environment.py b/pytorch_lightning/plugins/environments/torchelastic_environment.py index 9e4b4a9482bfd..597a01ed206a5 100644 --- a/pytorch_lightning/plugins/environments/torchelastic_environment.py +++ b/pytorch_lightning/plugins/environments/torchelastic_environment.py @@ -35,15 +35,17 @@ def is_using_torchelastic() -> bool: def creates_processes_externally(self) -> bool: return True - def master_address(self) -> str: + @property + def main_address(self) -> str: if "MASTER_ADDR" not in os.environ: rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost") os.environ["MASTER_ADDR"] = "127.0.0.1" log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") - master_address = os.environ.get("MASTER_ADDR") - return master_address + main_address = os.environ.get("MASTER_ADDR") + return main_address - def master_port(self) -> int: + @property + def main_port(self) -> int: if "MASTER_PORT" not in os.environ: rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910") os.environ["MASTER_PORT"] = "12910" diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 53aff5128ff37..63d05741e845e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -85,7 +85,7 @@ class DDPPlugin(ParallelPlugin): """Plugin for multi-process single-device training on one or multiple nodes. - The master process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of + The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. """ @@ -188,8 +188,8 @@ def _call_children_scripts(self): self._check_can_spawn_children() # DDP Environment variables - os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) # allow the user to pass the node rank os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5e04ca95743eb..80fde80ec41ee 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -132,7 +132,7 @@ def _is_single_process_single_device(self): return True def setup(self) -> None: - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) # pass in a state q smp = mp.get_context("spawn") self.mp_queue = smp.SimpleQueue() @@ -178,7 +178,7 @@ def spawn(self, function: Callable, *args: Any, return_result: bool = True, **kw Return: The output of the function of process 0. """ - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) context = mp.get_context("spawn") return_queue = context.SimpleQueue() if return_result else None mp.spawn(self._wrapped_function, args=(function, args, kwargs, return_queue), nprocs=self.num_processes) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index a4be698dc6602..2464a8ba4eeca 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -376,13 +376,11 @@ def _init_deepspeed_distributed(self) -> None: f"GLOBAL_RANK: {self.global_rank}, " f"MEMBER: {self.global_rank + 1}/{self.world_size}" ) - deepspeed.init_distributed( - self.torch_distributed_backend, distributed_port=self.cluster_environment.master_port() - ) + deepspeed.init_distributed(self.torch_distributed_backend, distributed_port=self.cluster_environment.main_port) def _set_node_environment_variables(self) -> None: - os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) os.environ["RANK"] = str(self.global_rank) os.environ["WORLD_SIZE"] = str(self.world_size) os.environ["LOCAL_RANK"] = str(self.local_rank) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 2509122bd99e2..7aa4a67721c04 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -326,7 +326,7 @@ def test_step_end(self, output: STEP_OUTPUT) -> STEP_OUTPUT: def _pod_progress_bar_force_stdout(self) -> None: # Why is it required? The way `pytorch_xla.distributed` streams logs - # from different vms to the master worker doesn't work well with tqdm + # from different vms to the main worker doesn't work well with tqdm # Ref: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/xla_dist.py#L140 # The print statement seems to force tqdm to flush stdout. if self.tpu_global_core_rank == 0 and int(os.getenv(xenv.TPUVM_MODE, 0)) == 1: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index a1e55631c2a7d..c23edf594146f 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -180,7 +180,7 @@ def results(self) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: The result is cached instead of returned directly, because some plugins require transmitting the results from one multiprocessing context to another in a separate step. For example, the plugins that use the "spawn" - start-method send the result to the master process through a + start-method send the result to the main process through a `multiprocessing queue (shared memory) `_. """ return self._results diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 47fa7b791eae0..ef0f0b5ef3e97 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -162,7 +162,7 @@ def sync_ddp_if_available( def sync_ddp( result: torch.Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None ) -> torch.Tensor: - """Function to reduce the tensors from several ddp processes to one master process. + """Function to reduce the tensors from several ddp processes to one main process. Args: result: the value to sync and reduce (typically tensor or number) @@ -379,8 +379,8 @@ def init_dist_connection( """ global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() world_size = world_size if world_size is not None else cluster_environment.world_size() - os.environ["MASTER_ADDR"] = cluster_environment.master_address() - os.environ["MASTER_PORT"] = str(cluster_environment.master_port()) + os.environ["MASTER_ADDR"] = cluster_environment.main_address + os.environ["MASTER_PORT"] = str(cluster_environment.main_port) if torch.distributed.is_available() and not torch.distributed.is_initialized(): log.info(f"initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group( diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 810f96bfdd08d..d95f5c8e6f9ea 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -368,7 +368,8 @@ def test_accelerator_choice_ddp_cpu_custom_cluster(_, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" class CustomCluster(LightningEnvironment): - def master_address(self): + @property + def main_address(self): return "asdf" @property diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 16c511b6effd9..ec44d9842ce2a 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -23,6 +23,12 @@ from pytorch_lightning.callbacks.progress import ProgressBar from pytorch_lightning.callbacks.xla_stats_monitor import XLAStatsMonitor from pytorch_lightning.loggers import LoggerCollection, TestTubeLogger +from pytorch_lightning.plugins.environments import ( + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) from tests.callbacks.test_callbacks import OldStatefulCallback from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel @@ -455,3 +461,43 @@ def test_v1_7_0_deprecate_lr_sch_names(tmpdir): with pytest.deprecated_call(match="`LearningRateMonitor.lr_sch_names` has been deprecated in v1.5"): assert lr_monitor.lr_sch_names == ["lr-SGD"] + + +@pytest.mark.parametrize( + "cls", + [ + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, + ], +) +def test_v1_7_0_cluster_environment_master_address(cls): + class MyClusterEnvironment(cls): + def master_address(self): + pass + + with pytest.deprecated_call( + match="MyClusterEnvironment.master_address` has been deprecated in v1.6 and will be removed in 1.7" + ): + MyClusterEnvironment() + + +@pytest.mark.parametrize( + "cls", + [ + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, + ], +) +def test_v1_7_0_cluster_environment_master_port(cls): + class MyClusterEnvironment(cls): + def master_port(self): + pass + + with pytest.deprecated_call( + match="MyClusterEnvironment.master_port` has been deprecated in v1.6 and will be removed in 1.7" + ): + MyClusterEnvironment() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index b9c510ceb8437..23ada729505ab 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -278,7 +278,7 @@ def test_accelerator_set_when_using_tpu(tmpdir, tpu_cores): @RunIf(tpu=True) @pl_multi_process_test def test_broadcast_on_tpu(): - """Checks if an object from the master process is broadcasted to other processes correctly.""" + """Checks if an object from the main process is broadcasted to other processes correctly.""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) diff --git a/tests/plugins/environments/test_kubeflow_environment.py b/tests/plugins/environments/test_kubeflow_environment.py index 44fb66fca0d91..1e2efaf36698b 100644 --- a/tests/plugins/environments/test_kubeflow_environment.py +++ b/tests/plugins/environments/test_kubeflow_environment.py @@ -28,10 +28,10 @@ def test_default_attributes(): with pytest.raises(KeyError): # MASTER_ADDR is required - env.master_address() + env.main_address with pytest.raises(KeyError): # MASTER_PORT is required - env.master_port() + env.main_port with pytest.raises(KeyError): # WORLD_SIZE is required env.world_size() @@ -54,8 +54,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = KubeflowEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 0 diff --git a/tests/plugins/environments/test_lightning_environment.py b/tests/plugins/environments/test_lightning_environment.py index 2de3574a1805f..7848f4057027d 100644 --- a/tests/plugins/environments/test_lightning_environment.py +++ b/tests/plugins/environments/test_lightning_environment.py @@ -24,8 +24,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = LightningEnvironment() assert not env.creates_processes_externally - assert env.master_address() == "127.0.0.1" - assert isinstance(env.master_port(), int) + assert env.main_address == "127.0.0.1" + assert isinstance(env.main_port, int) assert env.world_size() == 1 assert env.local_rank() == 0 assert env.node_rank() == 0 @@ -35,8 +35,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(): """Test that the default cluster environment takes the attributes from the environment variables.""" env = LightningEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 1 assert env.global_rank() == 0 assert env.local_rank() == 2 @@ -67,12 +67,12 @@ def test_node_rank_from_group_rank(): @mock.patch.dict(os.environ, {}) def test_random_master_port(): - """Test randomly chosen master port when no master port was given by user.""" + """Test randomly chosen main port when no main port was given by user.""" env = LightningEnvironment() - port = env.master_port() + port = env.main_port assert isinstance(port, int) # repeated calls do not generate a new port number - assert env.master_port() == port + assert env.main_port == port @mock.patch.dict(os.environ, {"WORLD_SIZE": "1"}) diff --git a/tests/plugins/environments/test_lsf_environment.py b/tests/plugins/environments/test_lsf_environment.py index e3a5a67ba4be2..f438b236d8d37 100644 --- a/tests/plugins/environments/test_lsf_environment.py +++ b/tests/plugins/environments/test_lsf_environment.py @@ -36,10 +36,10 @@ def test_missing_lsb_job_id(): @mock.patch.dict(os.environ, {"MASTER_PORT": "4321", "LSB_JOBID": "1234", "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1"}) -def test_manual_master_port_and_address(): +def test_manual_main_port_and_address(): """Test a user can set the port manually through the MASTER_PORT env variable.""" env = LSFEnvironment() - assert env.master_port() == 4321 + assert env.main_port == 4321 @mock.patch.dict( @@ -56,8 +56,8 @@ def test_attributes_from_environment_variables(): """Test that the LSF environment takes the attributes from the environment variables.""" env = LSFEnvironment() assert env.creates_processes_externally - assert env.master_address() == "10.10.10.0" - assert env.master_port() == 10234 + assert env.main_address == "10.10.10.0" + assert env.main_port == 10234 assert env.world_size() == 4 assert env.global_rank() == 3 assert env.local_rank() == 1 diff --git a/tests/plugins/environments/test_slurm_environment.py b/tests/plugins/environments/test_slurm_environment.py index e2f1df4ecd9ad..5515c6bfc4986 100644 --- a/tests/plugins/environments/test_slurm_environment.py +++ b/tests/plugins/environments/test_slurm_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = SLURMEnvironment() assert env.creates_processes_externally - assert env.master_address() == "127.0.0.1" - assert env.master_port() == 12910 + assert env.main_address == "127.0.0.1" + assert env.main_port == 12910 with pytest.raises(KeyError): # world size is required to be passed as env variable env.world_size() @@ -52,8 +52,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the SLURM cluster environment takes the attributes from the environment variables.""" env = SLURMEnvironment() - assert env.master_address() == "1.1.1.1" - assert env.master_port() == 15000 + 1234 + assert env.main_address == "1.1.1.1" + assert env.main_port == 15000 + 1234 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 2 @@ -77,7 +77,7 @@ def test_attributes_from_environment_variables(caplog): [("alpha,beta,gamma", "alpha"), ("alpha beta gamma", "alpha"), ("1.2.3.[100-110]", "1.2.3.100")], ) def test_master_address_from_slurm_node_list(slurm_node_list, expected): - """Test extracting the master node from different formats for the SLURM_NODELIST.""" + """Test extracting the main node from different formats for the SLURM_NODELIST.""" with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}): env = SLURMEnvironment() - assert env.master_address() == expected + assert env.main_address == expected diff --git a/tests/plugins/environments/test_torchelastic_environment.py b/tests/plugins/environments/test_torchelastic_environment.py index d0df262c169cb..2eaf79f0a6d57 100644 --- a/tests/plugins/environments/test_torchelastic_environment.py +++ b/tests/plugins/environments/test_torchelastic_environment.py @@ -25,8 +25,8 @@ def test_default_attributes(): """Test the default attributes when no environment variables are set.""" env = TorchElasticEnvironment() assert env.creates_processes_externally - assert env.master_address() == "127.0.0.1" - assert env.master_port() == 12910 + assert env.main_address == "127.0.0.1" + assert env.main_port == 12910 assert env.world_size() is None with pytest.raises(KeyError): # local rank is required to be passed as env variable @@ -48,8 +48,8 @@ def test_default_attributes(): def test_attributes_from_environment_variables(caplog): """Test that the torchelastic cluster environment takes the attributes from the environment variables.""" env = TorchElasticEnvironment() - assert env.master_address() == "1.2.3.4" - assert env.master_port() == 500 + assert env.main_address == "1.2.3.4" + assert env.main_port == 500 assert env.world_size() == 20 assert env.global_rank() == 1 assert env.local_rank() == 2 diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c2464ed92e8c0..25f02a4c1eab5 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -820,8 +820,8 @@ def test_deepspeed_plugin_env_variables(mock_deepspeed_distributed, tmpdir, plat # assert no env variables have been set within the DeepSpeedPlugin assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK")) else: - assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.master_address()) - assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.master_port()) + assert os.environ["MASTER_ADDR"] == str(trainer.training_type_plugin.cluster_environment.main_address) + assert os.environ["MASTER_PORT"] == str(trainer.training_type_plugin.cluster_environment.main_port) assert os.environ["RANK"] == str(trainer.training_type_plugin.global_rank) assert os.environ["WORLD_SIZE"] == str(trainer.training_type_plugin.world_size) assert os.environ["LOCAL_RANK"] == str(trainer.training_type_plugin.local_rank)