From a4457f12208f3a2ddbf9bf5cd63762a0cacf8550 Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Mon, 24 Jul 2023 07:32:16 +0530 Subject: [PATCH 1/8] Use hydra.run.dir (not os.getcwd) for DDP subprocesses' run dir --- .../fabric/strategies/launchers/subprocess_script.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/strategies/launchers/subprocess_script.py b/src/lightning/fabric/strategies/launchers/subprocess_script.py index 62b3a718b90f9..a442531a497b0 100644 --- a/src/lightning/fabric/strategies/launchers/subprocess_script.py +++ b/src/lightning/fabric/strategies/launchers/subprocess_script.py @@ -143,6 +143,7 @@ def _basic_subprocess_cmd() -> Sequence[str]: def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 + from hydra.core.hydra_config import HydraConfig from hydra.utils import get_original_cwd, to_absolute_path # when user is using hydra find the absolute path @@ -154,6 +155,6 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: command += sys.argv[1:] cwd = get_original_cwd() - os_cwd = f'"{os.getcwd()}"' - command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] + run_dir = f'"{HydraConfig.get().run.dir}"' + command += [f"hydra.run.dir={run_dir}", f"hydra.job.name=train_ddp_process_{local_rank}"] return command, cwd From 65caaa54d32cb7d2ede7115d07573c284c82d97f Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Mon, 24 Jul 2023 08:02:43 +0530 Subject: [PATCH 2/8] Update changelog --- src/lightning/fabric/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 442abcfeec020..a6ecdf2366ed2 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -144,6 +144,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed issue where DDP subprocesses that used Hydra would set hydra's working directory to current directory ([#18145](https://github.com/Lightning-AI/lightning/pull/18145)) + + - Fixed issue where running on TPUs would select the wrong device index ([#17227](https://github.com/Lightning-AI/lightning/pull/17227)) From 44b1304a0f87a80e7671467a982f25c822c711f9 Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Fri, 28 Jul 2023 07:17:46 +0530 Subject: [PATCH 3/8] Add tests --- .../launchers/test_subprocess_script.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 9f2fb371dc967..882669460097e 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -58,11 +58,25 @@ def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): # Run CLI devices = 2 - cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"'] + run_dir = Path(tmpdir) / "hydra_output" + cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"] if subdir is not None: cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) + if subdir != "null": # There's no .hydra if subdir on command line is "null" + # Make sure config.yaml was created for additional processes. + logs = list(run_dir.glob("**/config.yaml")) + assert len(logs) == devices + + # Make sure the parameter was set and used + cfg = OmegaConf.load(logs[0]) + assert cfg.devices == devices + + # Make sure PL spawned a job that is logged by Hydra + logs = list(run_dir.glob("**/*.log")) + assert len(logs) == 1 + def test_kill(): launcher = _SubprocessScriptLauncher(Mock(), 1, 1) From 1f89360d4b1617af8f61b928606385d349cbef8e Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Fri, 28 Jul 2023 07:22:01 +0530 Subject: [PATCH 4/8] Parts of test file got left uncommitted by mistake. Commit --- .../strategies/launchers/test_subprocess_script.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 882669460097e..99e843f173c3f 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -1,5 +1,6 @@ import subprocess import sys +from pathlib import Path from unittest.mock import Mock import pytest @@ -13,6 +14,7 @@ if _HYDRA_WITH_RUN_PROCESS: from hydra.test_utils.test_utils import run_process + from omegaconf import OmegaConf # Script to run from command line @@ -48,7 +50,7 @@ def task_fn(cfg): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) -@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"]) +@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"]) def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): monkeypatch.chdir(tmpdir) From 8e0f5fa2c5931880aa4026cdfabeda8e14464cc9 Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Fri, 28 Jul 2023 07:25:41 +0530 Subject: [PATCH 5/8] Create separate subdirs for different subprocesses --- src/lightning/fabric/strategies/launchers/subprocess_script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning/fabric/strategies/launchers/subprocess_script.py b/src/lightning/fabric/strategies/launchers/subprocess_script.py index a442531a497b0..fc161bae2ecb1 100644 --- a/src/lightning/fabric/strategies/launchers/subprocess_script.py +++ b/src/lightning/fabric/strategies/launchers/subprocess_script.py @@ -157,4 +157,6 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: cwd = get_original_cwd() run_dir = f'"{HydraConfig.get().run.dir}"' command += [f"hydra.run.dir={run_dir}", f"hydra.job.name=train_ddp_process_{local_rank}"] + subdir = "null" if HydraConfig.get().output_subdir is None else f".pl_ddp_hydra_{local_rank}" + command += [f"hydra.output_subdir={subdir}"] return command, cwd From 120813a08cd08535882bf81733f0806169dc90ee Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Fri, 28 Jul 2023 07:33:44 +0530 Subject: [PATCH 6/8] Improve tests --- .../launchers/test_subprocess_script.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 99e843f173c3f..50c38762a1d16 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -66,18 +66,17 @@ def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) - if subdir != "null": # There's no .hydra if subdir on command line is "null" - # Make sure config.yaml was created for additional processes. - logs = list(run_dir.glob("**/config.yaml")) - assert len(logs) == devices + # Make sure config.yaml was created for additional processes iff subdir is present. + saved_confs = list(run_dir.glob("**/config.yaml")) + assert len(saved_confs) == (0 if subdir == "null" else devices) - # Make sure the parameter was set and used - cfg = OmegaConf.load(logs[0]) + if saved_confs: # Make sure the parameter was set and used + cfg = OmegaConf.load(saved_confs[0]) assert cfg.devices == devices - # Make sure PL spawned a job that is logged by Hydra - logs = list(run_dir.glob("**/*.log")) - assert len(logs) == 1 + # Make sure PL spawned jobs that are logged by Hydra + logs = list(run_dir.glob("**/*.log")) + assert len(logs) == devices def test_kill(): From c1f30445ebc8b0f7def84c4a6762e4ba554906b7 Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Sat, 29 Jul 2023 02:37:14 +0530 Subject: [PATCH 7/8] Replace `tmpdir` with `tmp_path` in test --- .../strategies/launchers/test_subprocess_script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 50c38762a1d16..dd4922dc89997 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -1,6 +1,5 @@ import subprocess import sys -from pathlib import Path from unittest.mock import Mock import pytest @@ -51,8 +50,8 @@ def task_fn(cfg): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) @pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"]) -def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): - monkeypatch.chdir(tmpdir) +def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) # Save script locally with open("temp.py", "w") as fn: @@ -60,7 +59,7 @@ def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): # Run CLI devices = 2 - run_dir = Path(tmpdir) / "hydra_output" + run_dir = tmp_path / "hydra_output" cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"] if subdir is not None: cmd += [f"hydra.output_subdir={subdir}"] From bd44aa4bb16db9b84388bca74af2fece36668fc8 Mon Sep 17 00:00:00 2001 From: Nisheeth Lahoti Date: Sun, 30 Jul 2023 13:52:48 +0530 Subject: [PATCH 8/8] Set subdir=null in subprocesses --- .../fabric/strategies/launchers/subprocess_script.py | 7 +++---- .../strategies/launchers/test_subprocess_script.py | 5 ++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lightning/fabric/strategies/launchers/subprocess_script.py b/src/lightning/fabric/strategies/launchers/subprocess_script.py index fc161bae2ecb1..fe3815c9588d0 100644 --- a/src/lightning/fabric/strategies/launchers/subprocess_script.py +++ b/src/lightning/fabric/strategies/launchers/subprocess_script.py @@ -155,8 +155,7 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: command += sys.argv[1:] cwd = get_original_cwd() - run_dir = f'"{HydraConfig.get().run.dir}"' - command += [f"hydra.run.dir={run_dir}", f"hydra.job.name=train_ddp_process_{local_rank}"] - subdir = "null" if HydraConfig.get().output_subdir is None else f".pl_ddp_hydra_{local_rank}" - command += [f"hydra.output_subdir={subdir}"] + rundir = f'"{HydraConfig.get().run.dir}"' + # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml + command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"] return command, cwd diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index dd4922dc89997..8d7d31d212c46 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -8,7 +8,6 @@ from lightning.pytorch.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from tests_pytorch.helpers.runif import RunIf -_HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2") _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7") if _HYDRA_WITH_RUN_PROCESS: @@ -65,9 +64,9 @@ def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch): cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) - # Make sure config.yaml was created for additional processes iff subdir is present. + # Make sure no config.yaml was created for additional processes saved_confs = list(run_dir.glob("**/config.yaml")) - assert len(saved_confs) == (0 if subdir == "null" else devices) + assert len(saved_confs) == (0 if subdir == "null" else 1) # Main process has config.yaml iff subdir!="null" if saved_confs: # Make sure the parameter was set and used cfg = OmegaConf.load(saved_confs[0])