From d94517785bd8c6968acbf86bc97ab9ff96caabf7 Mon Sep 17 00:00:00 2001
From: Zhaofeng Wu
Date: Wed, 10 Jun 2020 13:16:31 -0700
Subject: [PATCH] Increment port when taken

---
 pytorch_lightning/core/lightning.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index af7527f550d0e..593aba47b56e3 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -924,7 +924,8 @@ def init_ddp_connection(
             self,
             global_rank: int,
             world_size: int,
-            is_slurm_managing_tasks: bool = True
+            is_slurm_managing_tasks: bool = True,
+            retries: int = 20
     ) -> None:
         """
         Override to define your custom way of setting up a distributed environment.
@@ -957,7 +958,16 @@ def init_ddp_connection(
         torch_backend = "nccl" if self.trainer.on_gpu else "gloo"
 
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}")
-        torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)
+        while True:
+            try:
+                torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)
+                break
+            except RuntimeError:
+                # port is taken; we increment the port and try again
+                if retries <= 0:
+                    raise
+                retries -= 1
+                os.environ['MASTER_PORT'] = str(int(os.environ['MASTER_PORT']) + 1)
 
     def configure_apex(
             self,
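
For reference, the retry idea in this patch can be sketched outside the LightningModule. The snippet below is a minimal standalone version, assuming MASTER_ADDR and MASTER_PORT are already set in the environment and using a hypothetical helper name (init_process_group_with_port_retry) that is not part of pytorch_lightning:

    import os

    import torch.distributed as torch_distrib


    def init_process_group_with_port_retry(backend: str, global_rank: int,
                                           world_size: int, retries: int = 20) -> None:
        """Initialize the process group, bumping MASTER_PORT when the port is taken."""
        while True:
            try:
                torch_distrib.init_process_group(backend, rank=global_rank, world_size=world_size)
                return
            except RuntimeError:
                # init_process_group with the default env:// rendezvous raises
                # RuntimeError when the port is already in use; move to the next
                # port and retry until the retry budget is exhausted.
                if retries <= 0:
                    raise
                retries -= 1
                os.environ['MASTER_PORT'] = str(int(os.environ['MASTER_PORT']) + 1)

Each call consumes one port per failure, so with the default of 20 retries the search stays within a narrow, predictable port range above the initially configured MASTER_PORT.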