From d1002b588c4f7d35a20fe4a8f7da6605d6362cff Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Thu, 18 Nov 2021 21:41:11 +0100 Subject: [PATCH 1/8] refactor slurm_job_id --- .../plugins/environments/slurm_environment.py | 16 ++++++++++++++++ .../logger_connector/logger_connector.py | 4 +++- pytorch_lightning/trainer/trainer.py | 15 --------------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 4e7070be6b2f1..694885b4e4ee4 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -17,6 +17,7 @@ import re from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from typing import Optional log = logging.getLogger(__name__) @@ -42,6 +43,21 @@ def detect() -> bool: """Returns ``True`` if the current process was launched on a SLURM cluster.""" return "SLURM_NTASKS" in os.environ + @staticmethod + def job_id() -> Optional[int]: + job_id = os.environ.get("SLURM_JOB_ID") + if job_id: + try: + job_id = int(job_id) + except ValueError: + job_id = None + + # in interactive mode, don't make logs use the same job id + in_slurm_interactive_mode = os.environ.get("SLURM_JOB_NAME") == "bash" + if in_slurm_interactive_mode: + job_id = None + return job_id + @property def main_address(self) -> str: # figure out the root node addr diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 640fc667705a8..8f8c2822be536 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -20,6 +20,7 @@ from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.utilities import DeviceType, memory from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.metrics import metrics_to_scalars @@ -29,6 +30,7 @@ class LoggerConnector: def __init__(self, trainer: "pl.Trainer", log_gpu_memory: Optional[str] = None) -> None: self.trainer = trainer + self.env = SLURMEnvironment() if log_gpu_memory is not None: rank_zero_deprecation( "Setting `log_gpu_memory` with the trainer flag is deprecated in v1.5 and will be removed in v1.7. " @@ -81,7 +83,7 @@ def configure_logger(self, logger: Union[bool, LightningLoggerBase, Iterable[Lig # default logger self.trainer.logger = ( TensorBoardLogger( - save_dir=self.trainer.default_root_dir, version=self.trainer.slurm_job_id, name="lightning_logs" + save_dir=self.trainer.default_root_dir, version=self.env.job_id, name="lightning_logs" ) if logger else None diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 26fbcb4362c73..b3590f1c32c44 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1728,21 +1728,6 @@ def use_amp(self) -> bool: def is_global_zero(self) -> bool: return self.global_rank == 0 - @property - def slurm_job_id(self) -> Optional[int]: - job_id = os.environ.get("SLURM_JOB_ID") - if job_id: - try: - job_id = int(job_id) - except ValueError: - job_id = None - - # in interactive mode, don't make logs use the same job id - in_slurm_interactive_mode = os.environ.get("SLURM_JOB_NAME") == "bash" - if in_slurm_interactive_mode: - job_id = None - return job_id - @property def lightning_optimizers(self) -> List[LightningOptimizer]: if self._lightning_optimizers is None: From 23ba48b6acd0dfbdaa984049d24995380c7846ef Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Thu, 18 Nov 2021 22:03:05 +0100 Subject: [PATCH 2/8] fix mypy error --- .../trainer/connectors/logger_connector/logger_connector.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8f8c2822be536..43accfa2c479e 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -20,7 +20,6 @@ from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT from pytorch_lightning.trainer.states import RunningStage, TrainerFn -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.utilities import DeviceType, memory from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.metrics import metrics_to_scalars @@ -30,7 +29,6 @@ class LoggerConnector: def __init__(self, trainer: "pl.Trainer", log_gpu_memory: Optional[str] = None) -> None: self.trainer = trainer - self.env = SLURMEnvironment() if log_gpu_memory is not None: rank_zero_deprecation( "Setting `log_gpu_memory` with the trainer flag is deprecated in v1.5 and will be removed in v1.7. " @@ -83,7 +81,7 @@ def configure_logger(self, logger: Union[bool, LightningLoggerBase, Iterable[Lig # default logger self.trainer.logger = ( TensorBoardLogger( - save_dir=self.trainer.default_root_dir, version=self.env.job_id, name="lightning_logs" + save_dir=self.trainer.default_root_dir, version=self.SLURMEnvironment.job_id, name="lightning_logs" ) if logger else None From e05bcfde3b2632e1b5d11fe4ad907660b53ab3f7 Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Sun, 21 Nov 2021 19:59:29 +0100 Subject: [PATCH 3/8] deprecate slurm_job_id --- .../plugins/environments/slurm_environment.py | 5 ----- .../connectors/logger_connector/logger_connector.py | 3 ++- pytorch_lightning/trainer/trainer.py | 9 +++++++++ tests/deprecated_api/test_remove_1-7.py | 8 ++++++++ 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index ddd767d8fa402..ce2f2170b66b4 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -38,11 +38,6 @@ def __init__(self, auto_requeue: bool = True) -> None: def creates_processes_externally(self) -> bool: return True - @staticmethod - def detect() -> bool: - """Returns ``True`` if the current process was launched on a SLURM cluster.""" - return "SLURM_NTASKS" in os.environ - @staticmethod def job_id() -> Optional[int]: job_id = os.environ.get("SLURM_JOB_ID") diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 64fa1cc35b860..252da01769306 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -20,6 +20,7 @@ from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.utilities import DeviceType, memory from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.metrics import metrics_to_scalars @@ -81,7 +82,7 @@ def configure_logger(self, logger: Union[bool, LightningLoggerBase, Iterable[Lig # default logger self.trainer.logger = ( TensorBoardLogger( - save_dir=self.trainer.default_root_dir, version=self.SLURMEnvironment.job_id, name="lightning_logs" + save_dir=self.trainer.default_root_dir, version=SLURMEnvironment.job_id(), name="lightning_logs" ) if logger else None diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9b881e43d5293..7c1500ea0ac01 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -39,6 +39,7 @@ from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop from pytorch_lightning.loops.fit_loop import FitLoop from pytorch_lightning.plugins import DDPSpawnPlugin, ParallelPlugin, PLUGIN_INPUT, PrecisionPlugin, TrainingTypePlugin +from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.profiler import ( AdvancedProfiler, BaseProfiler, @@ -1728,6 +1729,14 @@ def use_amp(self) -> bool: def is_global_zero(self) -> bool: return self.global_rank == 0 + @property + def slurm_job_id(self) -> Optional[int]: + rank_zero_deprecation( + "Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0." + ) + job_id = SLURMEnvironment.job_id() + return job_id + @property def lightning_optimizers(self) -> List[LightningOptimizer]: if self._lightning_optimizers is None: diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index c8016961963ba..1f81017c13799 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -378,6 +378,14 @@ def test_v1_7_0_trainer_log_gpu_memory(tmpdir): _ = Trainer(log_gpu_memory="min_max") +def test_v1_7_0_deprecated_slurm_job_id(): + trainer = Trainer() + with pytest.deprecated_call( + match="Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0." + ): + _ = trainer.slurm_job_id() + + @RunIf(min_gpus=1) def test_v1_7_0_deprecate_gpu_stats_monitor(tmpdir): with pytest.deprecated_call(match="The `GPUStatsMonitor` callback was deprecated in v1.5"): From d2ee8af95c62e20013cefda7b733538b9971224f Mon Sep 17 00:00:00 2001 From: AndresAlgaba Date: Sun, 21 Nov 2021 20:36:35 +0100 Subject: [PATCH 4/8] fix property test --- tests/deprecated_api/test_remove_1-7.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 1f81017c13799..c7ac56b286daf 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -383,7 +383,7 @@ def test_v1_7_0_deprecated_slurm_job_id(): with pytest.deprecated_call( match="Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0." ): - _ = trainer.slurm_job_id() + trainer.slurm_job_id @RunIf(min_gpus=1) From 94fc1d5c9b62ba9918bfa3a2c6ae44661433f100 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Nov 2021 19:39:38 +0000 Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/plugins/environments/slurm_environment.py | 2 +- .../trainer/connectors/logger_connector/logger_connector.py | 2 +- pytorch_lightning/trainer/trainer.py | 4 +--- tests/deprecated_api/test_remove_1-7.py | 4 +--- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index ce2f2170b66b4..ad657e1e19564 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -15,9 +15,9 @@ import logging import os import re +from typing import Optional from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from typing import Optional log = logging.getLogger(__name__) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 252da01769306..b98f13138b36f 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -18,9 +18,9 @@ import pytorch_lightning as pl from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger +from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT from pytorch_lightning.trainer.states import RunningStage, TrainerFn -from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.utilities import DeviceType, memory from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.metrics import metrics_to_scalars diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7c1500ea0ac01..c65b9a3bebb37 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1731,9 +1731,7 @@ def is_global_zero(self) -> bool: @property def slurm_job_id(self) -> Optional[int]: - rank_zero_deprecation( - "Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0." - ) + rank_zero_deprecation("Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0.") job_id = SLURMEnvironment.job_id() return job_id diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index c7ac56b286daf..9c0c12f981f4b 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -380,9 +380,7 @@ def test_v1_7_0_trainer_log_gpu_memory(tmpdir): def test_v1_7_0_deprecated_slurm_job_id(): trainer = Trainer() - with pytest.deprecated_call( - match="Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0." - ): + with pytest.deprecated_call(match="Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0."): trainer.slurm_job_id From 3c0a114fbf34ab7cb0f08984166d624207c52143 Mon Sep 17 00:00:00 2001 From: Andres Algaba Date: Mon, 22 Nov 2021 14:20:27 +0100 Subject: [PATCH 6/8] one line suggestion Co-authored-by: thomas chaton --- pytorch_lightning/trainer/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c65b9a3bebb37..fdb7581df0997 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1732,8 +1732,7 @@ def is_global_zero(self) -> bool: @property def slurm_job_id(self) -> Optional[int]: rank_zero_deprecation("Method `slurm_job_id` is deprecated in v1.6.0 and will be removed in v1.7.0.") - job_id = SLURMEnvironment.job_id() - return job_id + return SLURMEnvironment.job_id() @property def lightning_optimizers(self) -> List[LightningOptimizer]: From 0f84890cc1ce7d9092a38f373691d34167a35853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 22 Nov 2021 17:32:52 +0100 Subject: [PATCH 7/8] add changelog entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96830878d84ae..55bf11d53bd31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,7 +62,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the `precision_plugin` constructor argument from `Accelerator` ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570)) -- +- Deprecated the property `Trainer.slurm_job_id` in favor of the new `SLURMEnvironment.slurm_job_id()` method ([#10622](https://github.com/PyTorchLightning/pytorch-lightning/pull/10622)) - From e3a0a7bf193b921221e744a5d9817f4c6e2de41f Mon Sep 17 00:00:00 2001 From: Andres Algaba Date: Mon, 22 Nov 2021 17:41:20 +0100 Subject: [PATCH 8/8] Update CHANGELOG.md Co-authored-by: ananthsub --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55bf11d53bd31..b5190bd8b56d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,7 +62,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the `precision_plugin` constructor argument from `Accelerator` ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570)) -- Deprecated the property `Trainer.slurm_job_id` in favor of the new `SLURMEnvironment.slurm_job_id()` method ([#10622](https://github.com/PyTorchLightning/pytorch-lightning/pull/10622)) +- Deprecated the property `Trainer.slurm_job_id` in favor of the new `SLURMEnvironment.job_id()` method ([#10622](https://github.com/PyTorchLightning/pytorch-lightning/pull/10622)) -