Skip to content

Commit cd91c67

Browse files
committed
Increment port when taken
1 parent bd49b07 commit cd91c67

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

pytorch_lightning/core/lightning.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,8 @@ def init_ddp_connection(
924924
self,
925925
proc_rank: int,
926926
world_size: int,
927-
is_slurm_managing_tasks: bool = True
927+
is_slurm_managing_tasks: bool = True,
928+
retries: int = 20
928929
) -> None:
929930
"""
930931
Override to define your custom way of setting up a distributed environment.
@@ -957,7 +958,16 @@ def init_ddp_connection(
957958

958959
torch_backend = "nccl" if self.trainer.on_gpu else "gloo"
959960
log.info(f"initializing ddp: LOCAL_RANK: {proc_rank}/{world_size - 1} WORLD_SIZE:{world_size}")
960-
torch_distrib.init_process_group(torch_backend, rank=proc_rank, world_size=world_size)
961+
while True:
962+
try:
963+
torch_distrib.init_process_group(torch_backend, rank=proc_rank, world_size=world_size)
964+
break
965+
except RuntimeError:
966+
# port is taken; we increment the port and try again
967+
if retries <= 0:
968+
raise
969+
retries -= 1
970+
os.environ['MASTER_PORT'] = str(int(os.environ['MASTER_PORT']) + 1)
961971

962972
def configure_apex(
963973
self,

0 commit comments

Comments (0)