From e861927dcfb80d828a2808c53f68f6e0fd175cc4 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 19 Nov 2022 11:09:16 +0100 Subject: [PATCH 1/9] revert new hydra cwd behavior --- .../strategies/launchers/subprocess_script.py | 38 +++++-------------- .../strategies/launchers/subprocess_script.py | 5 ++- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 07af414e23bc3..4a20a9718dcef 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -15,7 +15,7 @@ import subprocess import sys from time import sleep -from typing import Any, Callable, Sequence +from typing import Any, Callable, Sequence, Optional, Tuple import numpy as np from lightning_utilities.core.imports import RequirementCache @@ -116,15 +116,16 @@ def _call_children_scripts(self) -> None: # start process # if hydra is available and initialized, make sure to set the cwd correctly hydra_in_use = False + cwd: Optional[str] = None if _HYDRA_AVAILABLE: from hydra.core.hydra_config import HydraConfig hydra_in_use = HydraConfig.initialized() if hydra_in_use: - command = _hydra_subprocess_cmd(local_rank=local_rank) + command, cwd = _hydra_subprocess_cmd(local_rank=local_rank) else: command = _basic_subprocess_cmd() - subprocess.Popen(command, env=env_copy) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds @@ -149,10 +150,9 @@ def _basic_subprocess_cmd() -> Sequence[str]: return [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] -def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]: +def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 - from hydra.core.hydra_config import HydraConfig - from hydra.utils import to_absolute_path + from hydra.utils import to_absolute_path, get_original_cwd # when user is using hydra find the absolute path if __main__.__spec__ is None: # pragma: no-cover @@ -160,25 +160,7 @@ def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]: else: command = [sys.executable, "-m", __main__.__spec__.name] - # extract the hydra configuration - hydra_cfg = HydraConfig.get() - - # the location of the hydra configuration files saved for the current job - hydra_output = hydra_cfg.runtime.output_dir - if hydra_cfg.output_subdir is not None: - hydra_output = os.path.join(hydra_output, hydra_cfg.output_subdir) - - # check if experimental re-run capability exists - # otherwise use existing config.yaml which may have issues - pickled_config = os.path.join(hydra_output, "config.pickle") - if os.path.exists(pickled_config): - command += ["--experimental-rerun", pickled_config] - - else: - command += ["-cp", hydra_output, "-cn", "config.yaml"] - command += [ - f"hydra.output_subdir=.pl_ddp_hydra_{local_rank}", - f"hydra.run.dir={hydra_cfg.runtime.output_dir}", - ] - - return command + cwd = get_original_cwd() + os_cwd = f'"{os.getcwd()}"' + command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] + return command, cwd diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index b5f4d39973ede..a55d8d71cf7e8 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -111,17 +111,18 @@ def _call_children_scripts(self) -> None: del env_copy["PL_GLOBAL_SEED"] hydra_in_use = False + cwd: Optional[str] = None if _HYDRA_AVAILABLE: from hydra.core.hydra_config import HydraConfig hydra_in_use = HydraConfig.initialized() if hydra_in_use: - command = _hydra_subprocess_cmd(local_rank) + command, cwd = _hydra_subprocess_cmd(local_rank) else: command = _basic_subprocess_cmd() - subprocess.Popen(command, env=env_copy) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds From 435e2bcfe2d8c2d8209e86490ad6135ffce9d282 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 19 Nov 2022 10:12:36 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_lite/strategies/launchers/subprocess_script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 4a20a9718dcef..a0391b0d68eed 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -15,7 +15,7 @@ import subprocess import sys from time import sleep -from typing import Any, Callable, Sequence, Optional, Tuple +from typing import Any, Callable, Optional, Sequence, Tuple import numpy as np from lightning_utilities.core.imports import RequirementCache @@ -152,7 +152,7 @@ def _basic_subprocess_cmd() -> Sequence[str]: def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 - from hydra.utils import to_absolute_path, get_original_cwd + from hydra.utils import get_original_cwd, to_absolute_path # when user is using hydra find the absolute path if __main__.__spec__ is None: # pragma: no-cover From 48f8a1823e68c0bc7f20c747afa0de4dd88a3f12 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 19 Nov 2022 11:24:36 +0100 Subject: [PATCH 3/9] debug --- src/lightning_lite/strategies/launchers/subprocess_script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 4a20a9718dcef..d3fa29bcd1688 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -163,4 +163,5 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] + print(command) return command, cwd From 58c13c442eba0c2e377bbe64559fedce7e7ee741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 19 Nov 2022 11:27:00 +0100 Subject: [PATCH 4/9] debug --- src/pytorch_lightning/strategies/launchers/subprocess_script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index a55d8d71cf7e8..b94b888930c4d 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -117,6 +117,7 @@ def _call_children_scripts(self) -> None: hydra_in_use = HydraConfig.initialized() + print("hydra in use", hydra_in_use) if hydra_in_use: command, cwd = _hydra_subprocess_cmd(local_rank) else: From a34ed542a700e075064a7a5e74f0029ee377d6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 19 Nov 2022 11:30:22 +0100 Subject: [PATCH 5/9] debug --- src/lightning_lite/strategies/launchers/subprocess_script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 436229392ac1a..b7044caa33267 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -160,6 +160,8 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: else: command = [sys.executable, "-m", __main__.__spec__.name] + command += sys.argv[1:] + cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] From 59a50826c36480394da5e92d89cc6b33b7378df3 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 19 Nov 2022 11:38:11 +0100 Subject: [PATCH 6/9] fix tests --- .../strategies/launchers/test_subprocess_script.py | 8 ++++---- .../strategies/launchers/test_subprocess_script.py | 13 ------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/tests/tests_lite/strategies/launchers/test_subprocess_script.py b/tests/tests_lite/strategies/launchers/test_subprocess_script.py index 1b2e4360e2432..b16bab30ecf4d 100644 --- a/tests/tests_lite/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_lite/strategies/launchers/test_subprocess_script.py @@ -84,7 +84,7 @@ def test_subprocess_script_launcher_launch_processes(popen_mock, _): @mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen") def test_subprocess_script_launcher_hydra_in_use(popen_mock, _, monkeypatch): basic_command = Mock(return_value="basic_command") - hydra_command = Mock(return_value="hydra_command") + hydra_command = Mock(return_value=("hydra_command", "hydra_cwd")) monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_basic_subprocess_cmd", basic_command) monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_hydra_subprocess_cmd", hydra_command) @@ -101,7 +101,7 @@ def simulate_launch(): # when hydra not available monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_HYDRA_AVAILABLE", False) simulate_launch() - popen_mock.assert_called_with("basic_command", env=ANY) + popen_mock.assert_called_with("basic_command", env=ANY, cwd=None) popen_mock.reset_mock() import hydra @@ -112,7 +112,7 @@ def simulate_launch(): HydraConfigMock.initialized.return_value = False monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock) simulate_launch() - popen_mock.assert_called_with("basic_command", env=ANY) + popen_mock.assert_called_with("basic_command", env=ANY, cwd=None) popen_mock.reset_mock() # when hydra available and initialized @@ -121,5 +121,5 @@ def simulate_launch(): HydraConfigMock.initialized.return_value = True monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock) simulate_launch() - popen_mock.assert_called_with("hydra_command", env=ANY) + popen_mock.assert_called_with("hydra_command", env=ANY, cwd="hydra_cwd") popen_mock.reset_mock() diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 83605f53873db..e9d88df208091 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -76,19 +76,6 @@ def test_ddp_with_hydra_runjob(cleandir, subdir): cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) - # Make sure config.yaml was created for additional - # processes. - logs = list(Path.cwd().glob("**/config.yaml")) - assert len(logs) == devices - - # Make sure the parameter was set and used - cfg = OmegaConf.load(logs[0]) - assert cfg.devices == devices - - # Make sure PL spawned a job that is logged by Hydra - logs = list(Path.cwd().glob("**/*.log")) - assert len(logs) == 1 - @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) From a319408bc13920cc9616d105e17f816d10795080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 19 Nov 2022 11:42:13 +0100 Subject: [PATCH 7/9] debug --- .../launchers/test_subprocess_script.py | 93 +------------------ 1 file changed, 3 insertions(+), 90 deletions(-) diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index e9d88df208091..5b495f13ca316 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -1,35 +1,17 @@ -import logging -import os import sys -from pathlib import Path import pytest from lightning_utilities.core.imports import RequirementCache -from pytorch_lightning.strategies.launchers.subprocess_script import _HYDRA_AVAILABLE from tests_pytorch.helpers.runif import RunIf _HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2") _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7") -if _HYDRA_AVAILABLE: - from omegaconf import OmegaConf if _HYDRA_WITH_RUN_PROCESS: from hydra.test_utils.test_utils import run_process -# fixture to run hydra jobs in a clean temporary directory -# Hydra creates its own output directories and logs -@pytest.fixture -def cleandir(tmp_path): - """Run function in a temporary directory.""" - old_dir = os.getcwd() # get current working directory (cwd) - os.chdir(tmp_path) # change cwd to the temp-directory - yield tmp_path # yields control to the test to be run - os.chdir(old_dir) - logging.shutdown() - - # Script to run from command line script = """ import hydra @@ -64,7 +46,9 @@ def task_fn(cfg): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) @pytest.mark.parametrize("subdir", [None, "dksa", ".hello"]) -def test_ddp_with_hydra_runjob(cleandir, subdir): +def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): + monkeypatch.chdir(tmpdir) + # Save script locally with open("temp.py", "w") as fn: fn.write(script) @@ -75,74 +59,3 @@ def test_ddp_with_hydra_runjob(cleandir, subdir): if subdir is not None: cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) - - -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) -@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) -@pytest.mark.parametrize("num_jobs", [1, 2]) -def test_ddp_with_hydra_multirunjob(cleandir, num_jobs): - # Save script locally - with open("temp.py", "w") as fn: - fn.write(script) - - # create fake multirun params based on `num_jobs` - fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs)) - - # Run CLI - run_process([sys.executable, "temp.py", "+devices=2", '+strategy="ddp"', fake_param, "--multirun"]) - - # Make sure config.yaml was created for each job - configs = sorted(Path.cwd().glob("**/.pl_ddp_hydra_*/config.yaml")) - assert len(configs) == num_jobs - - # Make sure the parameter was set and used for each job - for i, config in enumerate(configs): - cfg = OmegaConf.load(config) - local_rank = int(config.parent.parent.parts[-1]) - assert cfg.devices == 2 - assert cfg.foo == local_rank - - logs = list(Path.cwd().glob("**/*.log")) - assert len(logs) == num_jobs - - -yaml_file = """ -hydra: - callbacks: - save_job_info: - _target_: hydra.experimental.callbacks.PickleJobInfoCallback -""" - - -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) -@pytest.mark.skipif(not _HYDRA_WITH_RERUN, reason=str(_HYDRA_WITH_RERUN)) -@pytest.mark.parametrize("num_jobs", [1, 2]) -def test_ddp_with_hydra_multirunjob_rerun(cleandir, num_jobs): - # Save script locally - with open("temp.py", "w") as fn: - fn.write(script) - - with open("config.yaml", "w") as fn: - fn.write(yaml_file) - - # create fake multirun params based on `num_jobs` - fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs)) - - # Run CLI - run_process( - [ - sys.executable, - "temp.py", - "-cp", - ".", - "-cn", - "config.yaml", - "+devices=2", - '+strategy="ddp"', - fake_param, - "--multirun", - ] - ) - - pickles = sorted(Path.cwd().glob("**/.hydra/config.pickle")) - assert len(pickles) == num_jobs From 4eefef6ae3e2d6d4efc5481fa22bfba7a9b2bbdb Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 19 Nov 2022 11:43:33 +0100 Subject: [PATCH 8/9] remove debug statements --- src/lightning_lite/strategies/launchers/subprocess_script.py | 1 - src/pytorch_lightning/strategies/launchers/subprocess_script.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index b7044caa33267..1fb8b9686bb41 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -165,5 +165,4 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] - print(command) return command, cwd diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index b94b888930c4d..a55d8d71cf7e8 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -117,7 +117,6 @@ def _call_children_scripts(self) -> None: hydra_in_use = HydraConfig.initialized() - print("hydra in use", hydra_in_use) if hydra_in_use: command, cwd = _hydra_subprocess_cmd(local_rank) else: From e5b83fd44af29b6351450fd0a56ed06c535d82fc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 19 Nov 2022 11:47:32 +0100 Subject: [PATCH 9/9] changelog --- src/pytorch_lightning/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 976ea6d056d7f..8a2848e6a6aa2 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -50,6 +50,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `pytorch_lightning.utilities.memory.get_gpu_memory_map` in favor of `pytorch_lightning.accelerators.cuda.get_nvidia_gpu_stats` ([#15617](https://github.com/Lightning-AI/lightning/pull/15617)) +- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) + ### Fixed