From 7dc554290aea360229aa7dc5b71a78241bbbd27a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 25 May 2020 08:07:57 -0400 Subject: [PATCH 1/3] updated docs --- pytorch_lightning/trainer/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 25ecd5435987e..eefbfe1a0d927 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -842,7 +842,10 @@ def fit( # route to appropriate start method # when using multi-node or DDP within a node start each module in a separate process if self.use_ddp2: - task = int(os.environ['SLURM_LOCALID']) + if self.is_slurm_managing_tasks: + task = int(os.environ['SLURM_LOCALID']) + elif 'WORLD_SIZE' in os.environ and 'GROUP_RANK' in os.environ: + task = int(os.environ['LOCAL_RANK']) self.ddp_train(task, model) elif self.use_ddp: if self.is_slurm_managing_tasks: From ccbf62d4cd915504404de526e16cf604994a8505 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 26 May 2020 13:21:02 -0400 Subject: [PATCH 2/3] added mixed --- pytorch_lightning/trainer/trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index eefbfe1a0d927..7fc7aa2f6fe08 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -844,17 +844,19 @@ def fit( if self.use_ddp2: if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) - elif 'WORLD_SIZE' in os.environ and 'GROUP_RANK' in os.environ: + elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) self.ddp_train(task, model) elif self.use_ddp: if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) self.ddp_train(task, model) - # torchelastic - elif 'WORLD_SIZE' in os.environ and 'GROUP_RANK' in os.environ: + + # torchelastic or general non_slurm ddp + elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) self.ddp_train(task, model) + else: self.__set_random_port() # track for predict From 75b3b022248e4269431e80cef311649d752b8723 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 26 May 2020 13:21:22 -0400 Subject: [PATCH 3/3] added mixed --- pytorch_lightning/trainer/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7fc7aa2f6fe08..9e4f59eeb1ba4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -844,6 +844,8 @@ def fit( if self.use_ddp2: if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) + + # torchelastic or general non_slurm ddp2 elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) self.ddp_train(task, model)