File tree: 1 file changed, +12 -2
lines changed Original file line number Diff line number Diff line change @@ -924,7 +924,8 @@ def init_ddp_connection(
924924 self ,
925925 proc_rank : int ,
926926 world_size : int ,
927- is_slurm_managing_tasks : bool = True
927+ is_slurm_managing_tasks : bool = True ,
928+ retries : int = 20
928929 ) -> None :
929930 """
930931 Override to define your custom way of setting up a distributed environment.
@@ -957,7 +958,16 @@ def init_ddp_connection(
957958
958959 torch_backend = "nccl" if self .trainer .on_gpu else "gloo"
959960 log .info (f"initializing ddp: LOCAL_RANK: { proc_rank } /{ world_size - 1 } WORLD_SIZE:{ world_size } " )
960- torch_distrib .init_process_group (torch_backend , rank = proc_rank , world_size = world_size )
961+ while True :
962+ try :
963+ torch_distrib .init_process_group (torch_backend , rank = proc_rank , world_size = world_size )
964+ break
965+ except RuntimeError :
966+ # port is taken; we increment the port and try again
967+ if retries <= 0 :
968+ raise
969+ retries -= 1
970+ os .environ ['MASTER_PORT' ] = str (int (os .environ ['MASTER_PORT' ]) + 1 )
961971
962972 def configure_apex (
963973 self ,
You can’t perform that action at this time.
0 commit comments