1515import subprocess
1616import sys
1717from time import sleep
18- from typing import Any , Callable , Optional
18+ from typing import Any , Callable , Sequence
1919
2020import __main__
2121import numpy as np
2424from lightning_lite .plugins .environments .cluster_environment import ClusterEnvironment
2525from lightning_lite .strategies .launchers .base import _Launcher
2626
27- _HYDRA_AVAILABLE = RequirementCache ("hydra" )
27+ _HYDRA_AVAILABLE = RequirementCache ("hydra-core " )
2828
2929
3030class _SubprocessScriptLauncher (_Launcher ):
@@ -104,32 +104,6 @@ def _call_children_scripts(self) -> None:
104104 # allow the user to pass the node rank
105105 os .environ ["NODE_RANK" ] = str (self .cluster_environment .node_rank ())
106106 os .environ ["LOCAL_RANK" ] = str (self .cluster_environment .local_rank ())
107-
108- # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
109- # See https://docs.python.org/3/reference/import.html#main-spec
110- if __main__ .__spec__ is None : # pragma: no-cover
111- # Script called as `python a/b/c.py`
112- if _HYDRA_AVAILABLE :
113- # when user is using hydra find the absolute path
114- from hydra .utils import to_absolute_path
115-
116- to_abs_path = to_absolute_path
117- else :
118- to_abs_path = os .path .abspath
119-
120- # pull out the commands used to run the script and resolve the absolute file path
121- command = sys .argv
122- try :
123- full_path = to_abs_path (command [0 ])
124- except Exception :
125- full_path = os .path .abspath (command [0 ])
126-
127- command [0 ] = full_path
128- # use the same python interpreter and actually running
129- command = [sys .executable ] + command
130- else : # Script called as `python -m a.b.c`
131- command = [sys .executable , "-m" , __main__ .__spec__ .name ] + sys .argv [1 :]
132-
133107 os .environ ["WORLD_SIZE" ] = f"{ self .num_processes * self .num_nodes } "
134108
135109 for local_rank in range (1 , self .num_processes ):
@@ -142,16 +116,16 @@ def _call_children_scripts(self) -> None:
142116
143117 # start process
144118 # if hydra is available and initialized, make sure to set the cwd correctly
145- cwd : Optional [ str ] = None
119+ hydra_in_use = False
146120 if _HYDRA_AVAILABLE :
147121 from hydra .core .hydra_config import HydraConfig
148- from hydra .utils import get_original_cwd
149122
150- if HydraConfig .initialized ():
151- cwd = get_original_cwd ()
152- os_cwd = f'"{ os .getcwd ()} "'
153- command += [f"hydra.run.dir={ os_cwd } " , f"hydra.job.name=train_ddp_process_{ local_rank } " ]
154- subprocess .Popen (command , env = env_copy , cwd = cwd )
123+ hydra_in_use = HydraConfig .initialized ()
124+ if hydra_in_use :
125+ command = _hydra_subprocess_cmd (local_rank = local_rank )
126+ else :
127+ command = _basic_subprocess_cmd ()
128+ subprocess .Popen (command , env = env_copy )
155129
156130 # starting all processes at once can cause issues
157131 # with dataloaders delay between 1-10 seconds
@@ -165,3 +139,44 @@ def _check_can_spawn_children(self) -> None:
165139 " Possible reasons: 1) LOCAL_RANK environment variable was incorrectly modified by the user,"
166140 " 2) `ClusterEnvironment.creates_processes_externally` incorrectly implemented."
167141 )
142+
143+
144+ def _basic_subprocess_cmd () -> Sequence [str ]:
145+ if __main__ .__spec__ is None : # pragma: no-cover
146+ return [sys .executable , os .path .abspath (sys .argv [0 ])] + sys .argv [1 :]
147+ else :
148+ return [sys .executable , "-m" , __main__ .__spec__ .name ] + sys .argv [1 :]
149+
150+
151+ def _hydra_subprocess_cmd (local_rank : int ) -> Sequence [str ]:
152+ from hydra .core .hydra_config import HydraConfig
153+ from hydra .utils import to_absolute_path
154+
155+ # when user is using hydra find the absolute path
156+ if __main__ .__spec__ is None : # pragma: no-cover
157+ command = [sys .executable , to_absolute_path (sys .argv [0 ])]
158+ else :
159+ command = [sys .executable , "-m" , __main__ .__spec__ .name ]
160+
161+ # extract the hydra configuration
162+ hydra_cfg = HydraConfig .get ()
163+
164+ # the location of the hydra configuration files saved for the current job
165+ hydra_output = hydra_cfg .runtime .output_dir
166+ if hydra_cfg .output_subdir is not None :
167+ hydra_output = os .path .join (hydra_output , hydra_cfg .output_subdir )
168+
169+ # check if experimental re-run capability exists
170+ # otherwise use existing config.yaml which may have issues
171+ pickled_config = os .path .join (hydra_output , "config.pickle" )
172+ if os .path .exists (pickled_config ):
173+ command += ["--experimental-rerun" , pickled_config ]
174+
175+ else :
176+ command += ["-cp" , hydra_output , "-cn" , "config.yaml" ]
177+ command += [
178+ f"hydra.output_subdir=.pl_ddp_hydra_{ local_rank } " ,
179+ f"hydra.run.dir={ hydra_cfg .runtime .output_dir } " ,
180+ ]
181+
182+ return command
0 commit comments