From 05422ca1ae5deeeb714b073866b4cc1a4952bb29 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 10 Mar 2021 11:56:04 +0000 Subject: [PATCH 1/3] Ensure we set the default device before initializing deepspeed --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index a481c0c2e206b..654e74e566630 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -231,6 +231,8 @@ def _init_scheduler_optimizer(self): return optimizer, scheduler, optimizer_frequencies def _initialize_deepspeed_train(self, model): + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: rank_zero_info( From 01adbd4746e1c41d3f2ddb9b4f9ed1e6deab7a77 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 10 Mar 2021 11:59:57 +0000 Subject: [PATCH 2/3] Add CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f73292f79342a..6e06e76269a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380)) +- Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460)) + + ## [1.2.3] - 2021-03-09 ### Fixed From a0dd6ee961fa9b5fb0051754e3898fdcca08836c Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Wed, 10 Mar 2021 12:09:12 +0000 Subject: [PATCH 3/3] Update pytorch_lightning/plugins/training_type/deepspeed.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 654e74e566630..b54155d60eae5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -231,7 +231,7 @@ def _init_scheduler_optimizer(self): return optimizer, scheduler, optimizer_frequencies def _initialize_deepspeed_train(self, model): - if self.root_device.type == "cuda": + if self.on_gpu: torch.cuda.set_device(self.root_device) optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: