From 1ce342431212a80aedde27eeee9493b1bae0d6a9 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 21 Dec 2021 16:06:38 +0100
Subject: [PATCH 1/8] Fix CLI race condition saving the config

---
 pytorch_lightning/utilities/cli.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py
index f25920a40bf83..ad7d8eabc4ad7 100644
--- a/pytorch_lightning/utilities/cli.py
+++ b/pytorch_lightning/utilities/cli.py
@@ -404,21 +404,25 @@ def __init__(
     def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None:
         # save the config in `setup` because (1) we want it to save regardless of the trainer function run
         # and we want to save before processes are spawned
-        log_dir = trainer.log_dir
+        log_dir = trainer.log_dir  # this broadcasts the directory
         assert log_dir is not None
+        fs = get_filesystem(log_dir)
         config_path = os.path.join(log_dir, self.config_filename)
-        if not self.overwrite and os.path.isfile(config_path):
-            raise RuntimeError(
-                f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting"
-                " results of a previous run. You can delete the previous config file,"
-                " set `LightningCLI(save_config_callback=None)` to disable config saving,"
-                " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file."
-            )
+        if not self.overwrite:
+            if fs.isfile(config_path):
+                raise RuntimeError(
+                    f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting"
+                    " results of a previous run. You can delete the previous config file,"
+                    " set `LightningCLI(save_config_callback=None)` to disable config saving,"
+                    " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file."
+                )
+            # make sure all processes finished `isfile` before letting rank 0 write
+            trainer.training_type_plugin.barrier()
         if trainer.is_global_zero:
             # save only on rank zero to avoid race conditions on DDP.
             # the `log_dir` needs to be created as we rely on the logger to do it usually
             # but it hasn't logged anything at this point
-            get_filesystem(log_dir).makedirs(log_dir, exist_ok=True)
+            fs.makedirs(log_dir, exist_ok=True)
             self.parser.save(
                 self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile
             )
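The first patch's pattern, distilled: every rank checks whether a previous config exists, all ranks synchronize on a barrier, and only rank 0 writes. Below is a minimal standalone sketch of that pattern, using plain `torch.distributed` where Lightning goes through `trainer.training_type_plugin`; `save_config_safely` and the `write_fn` callback are hypothetical names invented for the illustration, not part of Lightning's API.

```python
import os
from typing import Callable

import torch.distributed as dist


def save_config_safely(config_path: str, write_fn: Callable[[str], None], overwrite: bool = False) -> None:
    """Sketch of the PATCH 1/8 pattern: check on all ranks, barrier, write on rank 0."""
    rank = dist.get_rank() if dist.is_initialized() else 0
    if not overwrite and os.path.isfile(config_path):
        # every rank performs the check, so every rank raises consistently
        raise RuntimeError(f"Expected {config_path} to NOT exist. Aborting to avoid overwriting it.")
    if dist.is_initialized():
        # make sure all ranks finished `isfile` before rank 0 writes; without this,
        # a slow rank could run its check *after* rank 0 created the file and abort
        dist.barrier()
    if rank == 0:
        # only rank 0 writes, avoiding concurrent writes to the same path
        os.makedirs(os.path.dirname(config_path) or ".", exist_ok=True)
        write_fn(config_path)
```

The barrier is what closes the race: it orders "every rank has finished looking" strictly before "rank 0 writes the new file".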
From 8799905b5f8a3cd5bdb1cda38e88ba81f146ade4 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 21 Dec 2021 16:09:42 +0100
Subject: [PATCH 2/8] Update CHANGELOG

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f3d262e7ac82..1dd6b3d7909fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -340,7 +340,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 -
 
--
+- Fixed `LightningCLI` race condition saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199))
 
 ## [1.5.6] - 2021-12-15

From c595b51255c6ea052cfe90136f425adf8e6053d3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Dec 2021 03:25:38 +0000
Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f725c61851ee..7b707953a1500 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -335,7 +335,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed double evaluation bug with fault-tolerance enabled where the second call was completely skipped ([#11119](https://github.com/PyTorchLightning/pytorch-lightning/pull/11119))
 
-
+
 - Fixed `LightningCLI` race condition saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199))
 

From c284a095951c5ddf3cee7a60621ed60e4bf4e0ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 22 Dec 2021 05:16:05 +0100
Subject: [PATCH 4/8] Update pytorch_lightning/utilities/cli.py

---
 pytorch_lightning/utilities/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py
index ad7d8eabc4ad7..52de7579b7d3d 100644
--- a/pytorch_lightning/utilities/cli.py
+++ b/pytorch_lightning/utilities/cli.py
@@ -417,7 +417,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st
                     " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file."
                 )
             # make sure all processes finished `isfile` before letting rank 0 write
-            trainer.training_type_plugin.barrier()
+            trainer.strategy.barrier()
         if trainer.is_global_zero:
             # save only on rank zero to avoid race conditions on DDP.
             # the `log_dir` needs to be created as we rely on the logger to do it usually

From 11a03f4b885e1cdb67cdeec1e60237edd5ad2ad3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Dec 2021 04:16:55 +0000
Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bfba8f6313f9d..e42663ee0c300 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -344,7 +344,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `LightningCLI` race condition saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199))
 
-
+
 - Fixed an issue with the `TPUSpawnPlugin` handling the `XLA_USE_BF16` environment variable incorrectly ([#10990](https://github.com/PyTorchLightning/pytorch-lightning/pull/10990))
 
From e8668363aa9241c973be4cee340c291a9cecb59a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 22 Dec 2021 17:40:48 +0100
Subject: [PATCH 6/8] Update CHANGELOG.md

Co-authored-by: Roger Shieh
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e42663ee0c300..880a6a20d63e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -342,7 +342,7 @@
 
 - Fixed double evaluation bug with fault-tolerance enabled where the second call was completely skipped ([#11119](https://github.com/PyTorchLightning/pytorch-lightning/pull/11119))
 
-- Fixed `LightningCLI` race condition saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199))
+- Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199))
 
 - Fixed an issue with the `TPUSpawnPlugin` handling the `XLA_USE_BF16` environment variable incorrectly ([#10990](https://github.com/PyTorchLightning/pytorch-lightning/pull/10990))

From c8a93f887473507d6eef473967b676fcfeea2cda Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Wed, 22 Dec 2021 18:00:48 +0100
Subject: [PATCH 7/8] Broadcast isfile

---
 pytorch_lightning/utilities/cli.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py
index 52de7579b7d3d..454faa6d3a88c 100644
--- a/pytorch_lightning/utilities/cli.py
+++ b/pytorch_lightning/utilities/cli.py
@@ -406,19 +406,25 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st
         # and we want to save before processes are spawned
         log_dir = trainer.log_dir  # this broadcasts the directory
         assert log_dir is not None
-        fs = get_filesystem(log_dir)
+        is_global_zero = trainer.is_global_zero
         config_path = os.path.join(log_dir, self.config_filename)
+        fs = get_filesystem(log_dir)
+
         if not self.overwrite:
-            if fs.isfile(config_path):
+            # check if the file exists on rank 0
+            file_exists = fs.isfile(config_path) if is_global_zero else False
+            # broadcast whether to fail to all ranks
+            file_exists = trainer.strategy.broadcast(file_exists)
+            if file_exists:
                 raise RuntimeError(
                     f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting"
                     " results of a previous run. You can delete the previous config file,"
                     " set `LightningCLI(save_config_callback=None)` to disable config saving,"
                     " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file."
                 )
-            # make sure all processes finished `isfile` before letting rank 0 write
-            trainer.strategy.barrier()
-        if trainer.is_global_zero:
+
+        # save the file on rank 0
+        if is_global_zero:
             # save only on rank zero to avoid race conditions on DDP.
             # the `log_dir` needs to be created as we rely on the logger to do it usually
             # but it hasn't logged anything at this point
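PATCH 7/8 changes the approach: instead of every rank probing the filesystem and a barrier ordering the probe before the write, only rank 0 performs the `isfile` check and the resulting boolean is broadcast, so every rank takes the same raise-or-continue branch even when the ranks do not share a filesystem. Here is a standalone sketch of that pattern, again with plain `torch.distributed` standing in for `trainer.strategy.broadcast` (an assumption for illustration; `config_exists_on_rank_zero` is a made-up name, not Lightning API).

```python
import os

import torch.distributed as dist


def config_exists_on_rank_zero(config_path: str) -> bool:
    """Sketch of the PATCH 7/8 pattern: rank 0 checks, the result is broadcast."""
    rank = dist.get_rank() if dist.is_initialized() else 0
    # only rank 0 touches the filesystem; other ranks assume False for now
    exists = os.path.isfile(config_path) if rank == 0 else False
    if dist.is_initialized():
        # ship rank 0's answer to every rank so they all raise (or proceed) together
        payload = [exists]
        dist.broadcast_object_list(payload, src=0)
        exists = payload[0]
    return exists


# example: all ranks agree on whether to abort, and no barrier is required
if config_exists_on_rank_zero("config.yaml"):
    raise RuntimeError("config.yaml already exists; refusing to overwrite it")
```

The design gain over the barrier version is that the decision is single-sourced: previously each rank ran its own `isfile`, and the ranks could disagree when the filesystems differ or when a check straddled rank 0's write.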
From 55a0d99124d70c7bf4e78a5242929c1e63ba42de Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 23 Dec 2021 17:08:13 +0100
Subject: [PATCH 8/8] is_global_zero

---
 pytorch_lightning/utilities/cli.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py
index 454faa6d3a88c..51fb22a301035 100644
--- a/pytorch_lightning/utilities/cli.py
+++ b/pytorch_lightning/utilities/cli.py
@@ -406,13 +406,12 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st
         # and we want to save before processes are spawned
         log_dir = trainer.log_dir  # this broadcasts the directory
         assert log_dir is not None
-        is_global_zero = trainer.is_global_zero
         config_path = os.path.join(log_dir, self.config_filename)
         fs = get_filesystem(log_dir)
 
         if not self.overwrite:
             # check if the file exists on rank 0
-            file_exists = fs.isfile(config_path) if is_global_zero else False
+            file_exists = fs.isfile(config_path) if trainer.is_global_zero else False
             # broadcast whether to fail to all ranks
             file_exists = trainer.strategy.broadcast(file_exists)
             if file_exists:
@@ -424,7 +423,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st
                 )
 
         # save the file on rank 0
-        if is_global_zero:
+        if trainer.is_global_zero:
             # save only on rank zero to avoid race conditions on DDP.
             # the `log_dir` needs to be created as we rely on the logger to do it usually
             # but it hasn't logged anything at this point
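For reference, after all eight patches the `setup` method reads as below. This is assembled purely from the hunks above; the surrounding callback class (`SaveConfigCallback` upstream), its imports, and everything outside the hunks are elided.

```python
def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None:
    # save the config in `setup` because (1) we want it to save regardless of the trainer function run
    # and we want to save before processes are spawned
    log_dir = trainer.log_dir  # this broadcasts the directory
    assert log_dir is not None
    config_path = os.path.join(log_dir, self.config_filename)
    fs = get_filesystem(log_dir)

    if not self.overwrite:
        # check if the file exists on rank 0
        file_exists = fs.isfile(config_path) if trainer.is_global_zero else False
        # broadcast whether to fail to all ranks
        file_exists = trainer.strategy.broadcast(file_exists)
        if file_exists:
            raise RuntimeError(
                f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting"
                " results of a previous run. You can delete the previous config file,"
                " set `LightningCLI(save_config_callback=None)` to disable config saving,"
                " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file."
            )

    # save the file on rank 0
    if trainer.is_global_zero:
        # save only on rank zero to avoid race conditions on DDP.
        # the `log_dir` needs to be created as we rely on the logger to do it usually
        # but it hasn't logged anything at this point
        fs.makedirs(log_dir, exist_ok=True)
        self.parser.save(
            self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile
        )
```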