From 2e716842fc7f280997b7102d4f4223157a100549 Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Mon, 10 Jan 2022 22:50:55 +0100 Subject: [PATCH 1/6] Fix rank environment variable priority --- pytorch_lightning/utilities/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 7e817b14c4325..884bcd157268c 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -55,7 +55,7 @@ def wrapped_fn(*args: Any, **kwargs: Any) -> Optional[Any]: # TODO: this should be part of the cluster environment def _get_rank() -> int: - rank_keys = ("RANK", "SLURM_PROCID", "LOCAL_RANK") + rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID") for key in rank_keys: rank = os.environ.get(key) if rank is not None: From 8d259cfc06c5efae287744f06c2dbd98ccf8c73c Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Wed, 12 Jan 2022 14:00:20 +0100 Subject: [PATCH 2/6] Add comment explaining rank_keys order --- pytorch_lightning/utilities/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 884bcd157268c..4bed6fd2e0944 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -55,6 +55,7 @@ def wrapped_fn(*args: Any, **kwargs: Any) -> Optional[Any]: # TODO: this should be part of the cluster environment def _get_rank() -> int: + # SLURM_PROCID can be set even if Slurm is not managing the multiprocessing, therefore LOCAL_RANK needs to be checked first rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID") for key in rank_keys: rank = os.environ.get(key) From d66448635fae6aa320d1bacc2d9b00aca3bd193e Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Fri, 4 Feb 2022 00:48:28 +0100 Subject: [PATCH 3/6] Add changelog entry --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 110b313392250..cd15dd795204c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -526,6 +526,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to avoid validation loop run on restart ([#11552](https://github.com/PyTorchLightning/pytorch-lightning/pull/11552)) + +- Fixed environment variable priority for global rank determination ([#11406](https://github.com/PyTorchLightning/pytorch-lightning/pull/11406)) + ## [1.5.9] - 2022-01-20 From 9838561ac02747a60c8311423910f8dd23c2d605 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Feb 2022 23:49:56 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd15dd795204c..5aa54afa5ee14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -526,7 +526,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to avoid validation loop run on restart ([#11552](https://github.com/PyTorchLightning/pytorch-lightning/pull/11552)) - + - Fixed environment variable priority for global rank determination ([#11406](https://github.com/PyTorchLightning/pytorch-lightning/pull/11406)) From 2cef74cae0da8828b8eb5e6a817657052602058e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 10 Feb 2022 22:08:28 +0100 Subject: [PATCH 5/6] fix merge --- pytorch_lightning/utilities/rank_zero.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/rank_zero.py b/pytorch_lightning/utilities/rank_zero.py index df1e6792085c0..513798ff7aae7 100644 --- a/pytorch_lightning/utilities/rank_zero.py +++ b/pytorch_lightning/utilities/rank_zero.py @@ -37,7 +37,9 @@ def wrapped_fn(*args: Any, **kwargs: Any) -> Optional[Any]: # TODO: this should be part of the cluster environment def _get_rank() -> int: - rank_keys = ("RANK", "SLURM_PROCID", "LOCAL_RANK") + # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing, + # therefore LOCAL_RANK needs to be checked first + rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID") for key in rank_keys: rank = os.environ.get(key) if rank is not None: From d79ab4b60eeb21b5c8a44bde36531f93ed0da29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 10 Feb 2022 22:17:10 +0100 Subject: [PATCH 6/6] add test --- tests/utilities/rank_zero.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/utilities/rank_zero.py b/tests/utilities/rank_zero.py index 61bcf61c0ca59..15a55fdd877f2 100644 --- a/tests/utilities/rank_zero.py +++ b/tests/utilities/rank_zero.py @@ -53,3 +53,19 @@ def foo(): x = foo() assert x is None + + +@pytest.mark.parametrize( + "environ,expected_rank", + [ + ({"SLURM_PROCID": "2"}, 2), + ({"SLURM_PROCID": "2", "LOCAL_RANK": "1"}, 1), + ({"SLURM_PROCID": "2", "LOCAL_RANK": "1", "RANK": "0"}, 0), + ], +) +def test_rank_zero_priority(environ, expected_rank): + """Test the priority in which the rank gets determined when multiple environment variables are available.""" + with mock.patch.dict(os.environ, environ): + from pytorch_lightning.utilities.rank_zero import _get_rank + + assert _get_rank() == expected_rank