diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
index b3fbf0d17fd4c..a6598c1f5f62e 100644
--- a/src/lightning/fabric/CHANGELOG.md
+++ b/src/lightning/fabric/CHANGELOG.md
@@ -147,6 +147,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed issue where DDP subprocesses that used Hydra would set hydra's working directory to current directory ([#18145](https://github.com/Lightning-AI/lightning/pull/18145))
+
+
 - Fixed issue where running on TPUs would select the wrong device index ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))
 
 
diff --git a/src/lightning/fabric/strategies/launchers/subprocess_script.py b/src/lightning/fabric/strategies/launchers/subprocess_script.py
index 62b3a718b90f9..fe3815c9588d0 100644
--- a/src/lightning/fabric/strategies/launchers/subprocess_script.py
+++ b/src/lightning/fabric/strategies/launchers/subprocess_script.py
@@ -143,6 +143,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:
 
 def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
+    from hydra.core.hydra_config import HydraConfig
     from hydra.utils import get_original_cwd, to_absolute_path
 
     # when user is using hydra find the absolute path
@@ -154,6 +155,7 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     command += sys.argv[1:]
 
     cwd = get_original_cwd()
-    os_cwd = f'"{os.getcwd()}"'
-    command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
+    rundir = f'"{HydraConfig.get().run.dir}"'
+    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
+    command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"]
     return command, cwd
diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py
index 9f2fb371dc967..8d7d31d212c46 100644
--- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py
+++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py
@@ -8,11 +8,11 @@
 from lightning.pytorch.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
 from tests_pytorch.helpers.runif import RunIf
 
-_HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2")
 _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7")
 
 if _HYDRA_WITH_RUN_PROCESS:
     from hydra.test_utils.test_utils import run_process
+    from omegaconf import OmegaConf
 
 
 # Script to run from command line
@@ -48,9 +48,9 @@ def task_fn(cfg):
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
-@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
-def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
-    monkeypatch.chdir(tmpdir)
+@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"])
+def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
 
     # Save script locally
     with open("temp.py", "w") as fn:
@@ -58,11 +58,24 @@ def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
 
     # Run CLI
     devices = 2
-    cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"']
+    run_dir = tmp_path / "hydra_output"
+    cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"]
     if subdir is not None:
         cmd += [f"hydra.output_subdir={subdir}"]
     run_process(cmd)
 
+    # Make sure no config.yaml was created for additional processes
+    saved_confs = list(run_dir.glob("**/config.yaml"))
+    assert len(saved_confs) == (0 if subdir == "null" else 1)  # Main process has config.yaml iff subdir!="null"
+
+    if saved_confs:  # Make sure the parameter was set and used
+        cfg = OmegaConf.load(saved_confs[0])
+        assert cfg.devices == devices
+
+    # Make sure PL spawned jobs that are logged by Hydra
+    logs = list(run_dir.glob("**/*.log"))
+    assert len(logs) == devices
+
 
 def test_kill():
     launcher = _SubprocessScriptLauncher(Mock(), 1, 1)