From 783b24b2728c2e4f80a38d1676305767accd0d86 Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Tue, 23 Nov 2021 11:55:31 +0100 Subject: [PATCH 1/6] add job_name --- .../plugins/environments/slurm_environment.py | 34 +++++++++++-------- .../connectors/accelerator_connector.py | 2 +- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index ad657e1e19564..6b098773b6199 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -38,21 +38,6 @@ def __init__(self, auto_requeue: bool = True) -> None: def creates_processes_externally(self) -> bool: return True - @staticmethod - def job_id() -> Optional[int]: - job_id = os.environ.get("SLURM_JOB_ID") - if job_id: - try: - job_id = int(job_id) - except ValueError: - job_id = None - - # in interactive mode, don't make logs use the same job id - in_slurm_interactive_mode = os.environ.get("SLURM_JOB_NAME") == "bash" - if in_slurm_interactive_mode: - job_id = None - return job_id - @property def main_address(self) -> str: # figure out the root node addr @@ -100,6 +85,25 @@ def detect() -> bool: """Returns ``True`` if the current process was launched on a SLURM cluster.""" return "SLURM_NTASKS" in os.environ + @staticmethod + def job_name() -> Optional[str]: + return os.environ.get("SLURM_JOB_NAME") + + @staticmethod + def job_id() -> Optional[int]: + job_id = os.environ.get("SLURM_JOB_ID") + if job_id: + try: + job_id = int(job_id) + except ValueError: + job_id = None + + # in interactive mode, don't make logs use the same job id + in_slurm_interactive_mode = SLURMEnvironment.job_name() == "bash" + if in_slurm_interactive_mode: + job_id = None + return job_id + def world_size(self) -> int: return int(os.environ["SLURM_NTASKS"]) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7136437bbc69d..bc0dd4c2dc5e6 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -1009,7 +1009,7 @@ def _is_slurm_managing_tasks(self) -> bool: if ( (not self.use_ddp and not self.use_ddp2) or not SLURMEnvironment.detect() - or os.environ.get("SLURM_JOB_NAME") == "bash" # in interactive mode we don't manage tasks + or SLURMEnvironment.job_name() == "bash" # in interactive mode we don't manage tasks ): return False From 4c0fa73852a1c97e1498f79905604ac15b3117c8 Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Tue, 23 Nov 2021 19:22:55 +0100 Subject: [PATCH 2/6] add job_name tests --- tests/plugins/environments/test_slurm_environment.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/plugins/environments/test_slurm_environment.py b/tests/plugins/environments/test_slurm_environment.py index cce06e9c3984c..c15604b2fa8ea 100644 --- a/tests/plugins/environments/test_slurm_environment.py +++ b/tests/plugins/environments/test_slurm_environment.py @@ -27,6 +27,7 @@ def test_default_attributes(): assert env.creates_processes_externally assert env.main_address == "127.0.0.1" assert env.main_port == 12910 + assert env.job_name() is None with pytest.raises(KeyError): # world size is required to be passed as env variable env.world_size() @@ -47,6 +48,7 @@ def test_default_attributes(): "SLURM_LOCALID": "2", "SLURM_PROCID": "1", "SLURM_NODEID": "3", + "SLURM_JOB_NAME": "JOB", }, ) def test_attributes_from_environment_variables(caplog): @@ -59,6 +61,7 @@ def test_attributes_from_environment_variables(caplog): assert env.global_rank() == 1 assert env.local_rank() == 2 assert env.node_rank() == 3 + assert env.job_name() == "JOB" # setter should be no-op with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): env.set_global_rank(100) From acdd7f06a3729c2bbb78b1e1960936f25e5b8821 Mon Sep 17 00:00:00 2001 From: Andres Algaba Date: Tue, 30 Nov 2021 19:26:56 +0100 Subject: [PATCH 3/6] Update pytorch_lightning/plugins/environments/slurm_environment.py Co-authored-by: Jirka Borovec --- .../plugins/environments/slurm_environment.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 6b098773b6199..d7b200749a58e 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -92,11 +92,10 @@ def job_name() -> Optional[str]: @staticmethod def job_id() -> Optional[int]: job_id = os.environ.get("SLURM_JOB_ID") - if job_id: - try: - job_id = int(job_id) - except ValueError: - job_id = None + try: + job_id = int(job_id) + except ValueError: + job_id = None # in interactive mode, don't make logs use the same job id in_slurm_interactive_mode = SLURMEnvironment.job_name() == "bash" From 5e0b9bd5707bf256ad3426cb4f3312b8a8534920 Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Tue, 30 Nov 2021 20:10:33 +0100 Subject: [PATCH 4/6] reverse if job_id --- .../plugins/environments/slurm_environment.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index d7b200749a58e..0ef3125fa0663 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -92,10 +92,12 @@ def job_name() -> Optional[str]: @staticmethod def job_id() -> Optional[int]: job_id = os.environ.get("SLURM_JOB_ID") - try: - job_id = int(job_id) - except ValueError: - job_id = None + + if job_id: + try: + job_id = int(job_id) + except ValueError: + job_id = None # in interactive mode, don't make logs use the same job id in_slurm_interactive_mode = SLURMEnvironment.job_name() == "bash" From af0c0d8507ba105522a0abe835e73f9973e9b6e4 Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Tue, 30 Nov 2021 21:24:02 +0100 Subject: [PATCH 5/6] change order --- .../plugins/environments/slurm_environment.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 0ef3125fa0663..54cd7c1b84b0a 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -91,18 +91,22 @@ def job_name() -> Optional[str]: @staticmethod def job_id() -> Optional[int]: - job_id = os.environ.get("SLURM_JOB_ID") - - if job_id: - try: - job_id = int(job_id) - except ValueError: - job_id = None # in interactive mode, don't make logs use the same job id in_slurm_interactive_mode = SLURMEnvironment.job_name() == "bash" if in_slurm_interactive_mode: + return None + + job_id = os.environ.get("SLURM_JOB_ID") + + if job_id is None: + return None + + try: + job_id = int(job_id) + except ValueError: job_id = None + return job_id def world_size(self) -> int: From 4484a948e62b19ea8f45f858545ada437b69b123 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 1 Dec 2021 00:35:54 +0100 Subject: [PATCH 6/6] Update --- .../plugins/environments/slurm_environment.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 54cd7c1b84b0a..bde236c672837 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -91,23 +91,18 @@ def job_name() -> Optional[str]: @staticmethod def job_id() -> Optional[int]: - # in interactive mode, don't make logs use the same job id in_slurm_interactive_mode = SLURMEnvironment.job_name() == "bash" if in_slurm_interactive_mode: return None job_id = os.environ.get("SLURM_JOB_ID") - if job_id is None: return None - try: - job_id = int(job_id) + return int(job_id) except ValueError: - job_id = None - - return job_id + return None def world_size(self) -> int: return int(os.environ["SLURM_NTASKS"])