From 9c9d8558b28e1ceb09dcdffdb17d0238c672178f Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Fri, 17 Sep 2021 12:03:36 -0700
Subject: [PATCH 01/11] pt1 dir empty check

---
 .../callbacks/model_checkpoint.py            | 20 +++++++------------
 pytorch_lightning/plugins/io/torch_plugin.py |  5 ++++-
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 42cd078d2179d..7fcfd9c453733 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -37,7 +37,7 @@
 from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.types import _METRIC, STEP_OUTPUT
+from pytorch_lightning.utilities.types import _METRIC, _PATH, STEP_OUTPUT
 from pytorch_lightning.utilities.warnings import WarningCache
 
 log = logging.getLogger(__name__)
@@ -203,7 +203,7 @@ class ModelCheckpoint(Callback):
 
     def __init__(
         self,
-        dirpath: Optional[Union[str, Path]] = None,
+        dirpath: Optional[_PATH] = None,
         filename: Optional[str] = None,
         monitor: Optional[str] = None,
         verbose: bool = False,
@@ -265,7 +265,7 @@ def on_init_end(self, trainer: "pl.Trainer") -> None:
             self._save_on_train_epoch_end = trainer.val_check_interval == 1.0 and trainer.check_val_every_n_epoch == 1
 
     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        """When pretrain routine starts we build the ckpt dir on the fly."""
+        """When pretrain routine starts we determine the ckpt dir on the fly."""
         self.__resolve_ckpt_dir(trainer)
 
     def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
@@ -440,11 +440,8 @@ def __validate_init_configuration(self) -> None:
                 " will duplicate the last checkpoint saved."
             )
 
-    def __init_ckpt_dir(self, dirpath: Optional[Union[str, Path]], filename: Optional[str]) -> None:
-        self._fs = get_filesystem(str(dirpath) if dirpath else "")
-
-        if self.save_top_k != 0 and dirpath is not None and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0:
-            rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
+    def __init_ckpt_dir(self, dirpath: Optional[_PATH], filename: Optional[str]) -> None:
+        self._fs = get_filesystem(dirpath if dirpath else "")
 
         if dirpath and self._fs.protocol == "file":
             dirpath = os.path.realpath(dirpath)
@@ -616,9 +613,6 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
 
         self.dirpath = ckpt_path
 
-        if not trainer.fast_dev_run and trainer.should_rank_save_checkpoint:
-            self._fs.makedirs(self.dirpath, exist_ok=True)
-
     def _validate_monitor_key(self, trainer: "pl.Trainer") -> None:
         metrics = trainer.callback_metrics
 
@@ -735,7 +729,7 @@ def _update_best_and_save(
         if del_filepath is not None and filepath != del_filepath:
             trainer.training_type_plugin.remove_checkpoint(del_filepath)
 
-    def to_yaml(self, filepath: Optional[Union[str, Path]] = None) -> None:
+    def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
         """Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML
         file."""
         best_k = {k: v.item() for k, v in self.best_k_models.items()}
@@ -744,7 +738,7 @@ def to_yaml(self, filepath: Optional[Union[str, Path]] = None) -> None:
         with self._fs.open(filepath, "w") as fp:
             yaml.dump(best_k, fp)
 
-    def file_exists(self, filepath: Union[str, Path], trainer: "pl.Trainer") -> bool:
+    def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
         """Checks if a file exists on rank 0 and broadcasts the result to all other ranks, preventing the internal
         state to diverge between ranks."""
         exists = self._fs.exists(filepath)
diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py
index 4413afc5d4166..804b48b86fae1 100644
--- a/pytorch_lightning/plugins/io/torch_plugin.py
+++ b/pytorch_lightning/plugins/io/torch_plugin.py
@@ -31,7 +31,10 @@ class TorchCheckpointIO(CheckpointIO):
 
     def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
         fs = get_filesystem(path)
-        fs.makedirs(os.path.dirname(path), exist_ok=True)
+        dirname = os.path.dirname(path)
+        if fs.isdir(dirname) and len(fs.ls(dirname)) > 0:
+            rank_zero_warn(f"Checkpoint directory {dirname} exists and is not empty.")
+        fs.makedirs(dirname, exist_ok=True)
         try:
             # write the checkpoint dictionary on the file
             atomic_save(checkpoint, path)

From 2586a9a0a699bbb82b996c94f03f0c83dc48c93b Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Mon, 20 Sep 2021 13:12:26 -0700
Subject: [PATCH 02/11] clean imports

---
 pytorch_lightning/callbacks/model_checkpoint.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 7fcfd9c453733..73023b7d5a7ea 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -24,8 +24,7 @@
 import time
 from copy import deepcopy
 from datetime import timedelta
-from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional
 from weakref import proxy
 
 import numpy as np

From a94540e9e0fe1beb6b76b9c18c5944b0f1515627 Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Mon, 20 Sep 2021 16:49:05 -0700
Subject: [PATCH 03/11] bring back resolve mkdir?

---
 pytorch_lightning/callbacks/model_checkpoint.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 73023b7d5a7ea..8cc59191d1ea7 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -612,6 +612,9 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
 
         self.dirpath = ckpt_path
 
+        if not trainer.fast_dev_run and trainer.should_rank_save_checkpoint:
+            self._fs.makedirs(self.dirpath, exist_ok=True)
+
     def _validate_monitor_key(self, trainer: "pl.Trainer") -> None:
         metrics = trainer.callback_metrics
 

From 480fd821df88a932a9a0005d91b33083d828695a Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Tue, 21 Sep 2021 11:24:22 -0700
Subject: [PATCH 04/11] original doc

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 8cc59191d1ea7..2b555d4953e1f 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -264,7 +264,7 @@ def on_init_end(self, trainer: "pl.Trainer") -> None:
             self._save_on_train_epoch_end = trainer.val_check_interval == 1.0 and trainer.check_val_every_n_epoch == 1
 
     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        """When pretrain routine starts we determine the ckpt dir on the fly."""
+        """When pretrain routine starts we build the ckpt dir on the fly."""
         self.__resolve_ckpt_dir(trainer)
 
     def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:

From 960e8f7be52970da6907933c748f661060ce2127 Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Fri, 24 Sep 2021 15:57:17 -0700
Subject: [PATCH 05/11] warningcache

---
 pytorch_lightning/plugins/io/torch_plugin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py
index 804b48b86fae1..df3f808ee7d67 100644
--- a/pytorch_lightning/plugins/io/torch_plugin.py
+++ b/pytorch_lightning/plugins/io/torch_plugin.py
@@ -21,9 +21,10 @@
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.types import _PATH
+from pytorch_lightning.utilities.warnings import WarningCache
 
 log = logging.getLogger(__name__)
-
+warning_cache = WarningCache()
 
 class TorchCheckpointIO(CheckpointIO):
     """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints
@@ -33,7 +34,7 @@ class TorchCheckpointIO(CheckpointIO):
     def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
         fs = get_filesystem(path)
         dirname = os.path.dirname(path)
         if fs.isdir(dirname) and len(fs.ls(dirname)) > 0:
-            rank_zero_warn(f"Checkpoint directory {dirname} exists and is not empty.")
+            warning_cache.warn(f"Checkpoint directory {dirname} exists and is not empty.")
         fs.makedirs(dirname, exist_ok=True)
         try:
             # write the checkpoint dictionary on the file

From 3e4eb97caf5a454adab5902ffd4db652ce2b5e76 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Sep 2021 22:58:33 +0000
Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/plugins/io/torch_plugin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py
index df3f808ee7d67..78ba24bbef1ef 100644
--- a/pytorch_lightning/plugins/io/torch_plugin.py
+++ b/pytorch_lightning/plugins/io/torch_plugin.py
@@ -26,6 +26,7 @@
 log = logging.getLogger(__name__)
 warning_cache = WarningCache()
 
+
 class TorchCheckpointIO(CheckpointIO):
     """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints
     respectively, common for most use cases."""

From ba47e36b6a851706a7f0c66799b31f1ab8ad605e Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Fri, 24 Sep 2021 17:04:49 -0700
Subject: [PATCH 07/11] cp callback after resolve

---
 pytorch_lightning/callbacks/model_checkpoint.py | 5 +++++
 pytorch_lightning/plugins/io/torch_plugin.py    | 7 +------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 2b555d4953e1f..2210e585ca967 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -266,6 +266,7 @@ def on_init_end(self, trainer: "pl.Trainer") -> None:
     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """When pretrain routine starts we build the ckpt dir on the fly."""
         self.__resolve_ckpt_dir(trainer)
+        self.__check_if_dir_not_empty(self.dirpath, trainer)
 
     def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         self._last_time_checked = time.monotonic()
@@ -615,6 +616,10 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
         if not trainer.fast_dev_run and trainer.should_rank_save_checkpoint:
             self._fs.makedirs(self.dirpath, exist_ok=True)
 
+    def __check_if_dir_not_empty(self, dirpath: _PATH, trainer: "pl.Trainer") -> None:
+        if trainer.is_global_zero and self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0:
+            rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
+
     def _validate_monitor_key(self, trainer: "pl.Trainer") -> None:
         metrics = trainer.callback_metrics
 
diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py
index 78ba24bbef1ef..4413afc5d4166 100644
--- a/pytorch_lightning/plugins/io/torch_plugin.py
+++ b/pytorch_lightning/plugins/io/torch_plugin.py
@@ -21,10 +21,8 @@
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.types import _PATH
-from pytorch_lightning.utilities.warnings import WarningCache
 
 log = logging.getLogger(__name__)
-warning_cache = WarningCache()
 
 
 class TorchCheckpointIO(CheckpointIO):
@@ -33,10 +31,7 @@ class TorchCheckpointIO(CheckpointIO):
 
     def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
         fs = get_filesystem(path)
-        dirname = os.path.dirname(path)
-        if fs.isdir(dirname) and len(fs.ls(dirname)) > 0:
-            warning_cache.warn(f"Checkpoint directory {dirname} exists and is not empty.")
-        fs.makedirs(dirname, exist_ok=True)
+        fs.makedirs(os.path.dirname(path), exist_ok=True)
         try:
             # write the checkpoint dictionary on the file
             atomic_save(checkpoint, path)

From b9a48d2d2b69df328385dd7f18df00acb403d9fa Mon Sep 17 00:00:00 2001
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 25 Sep 2021 00:06:06 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/callbacks/model_checkpoint.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 2210e585ca967..f5e0dd78deb3f 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -617,7 +617,12 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: self._fs.makedirs(self.dirpath, exist_ok=True) def __check_if_dir_not_empty(self, dirpath: _PATH, trainer: "pl.Trainer") -> None: - if trainer.is_global_zero and self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0: + if ( + trainer.is_global_zero + and self.save_top_k != 0 + and self._fs.isdir(dirpath) + and len(self._fs.ls(dirpath)) > 0 + ): rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.") def _validate_monitor_key(self, trainer: "pl.Trainer") -> None: From 25ef398f889b61aa2e590f671ed67e1db6f60016 Mon Sep 17 00:00:00 2001 From: jjenniferdai <89552168+jjenniferdai@users.noreply.github.com> Date: Fri, 24 Sep 2021 17:50:42 -0700 Subject: [PATCH 09/11] move global_zero check outside warn fn Co-authored-by: ananthsub --- pytorch_lightning/callbacks/model_checkpoint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index f5e0dd78deb3f..ae2343387f658 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -266,7 +266,8 @@ def on_init_end(self, trainer: "pl.Trainer") -> None: def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: """When pretrain routine starts we build the ckpt dir on the fly.""" self.__resolve_ckpt_dir(trainer) - self.__check_if_dir_not_empty(self.dirpath, trainer) + if trainer.is_global_zero: + self.__warn_if_dir_not_empty(self.dirpath) def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._last_time_checked = time.monotonic() From 35e239fd475c0089b18eb24aacc153af5b9752a6 Mon Sep 17 00:00:00 2001 From: jjenniferdai <89552168+jjenniferdai@users.noreply.github.com> Date: Fri, 24 Sep 2021 17:50:59 -0700 Subject: [PATCH 10/11] move global_zero check outside warn fn 2 Co-authored-by: ananthsub --- pytorch_lightning/callbacks/model_checkpoint.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index ae2343387f658..1b8e9d7c466ac 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -617,10 +617,9 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: if not trainer.fast_dev_run and trainer.should_rank_save_checkpoint: self._fs.makedirs(self.dirpath, exist_ok=True) - def __check_if_dir_not_empty(self, dirpath: _PATH, trainer: "pl.Trainer") -> None: + def __warn_if_dir_not_empty(self, dirpath: _PATH) -> None: if ( - trainer.is_global_zero - and self.save_top_k != 0 + self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0 ): From 9353dd10bbc03f17e9cdbdea26c6d03b49a4b4e2 
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 25 Sep 2021 00:52:49 +0000
Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/callbacks/model_checkpoint.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 1b8e9d7c466ac..cff2116446b92 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -618,11 +618,7 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
             self._fs.makedirs(self.dirpath, exist_ok=True)
 
     def __warn_if_dir_not_empty(self, dirpath: _PATH) -> None:
-        if (
-            self.save_top_k != 0
-            and self._fs.isdir(dirpath)
-            and len(self._fs.ls(dirpath)) > 0
-        ):
+        if self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0:
            rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
 
     def _validate_monitor_key(self, trainer: "pl.Trainer") -> None:
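
End state of the series, illustrated: after patch 11 the non-empty-directory check lives in ModelCheckpoint.__warn_if_dir_not_empty and is invoked from on_pretrain_routine_start on global rank zero only, right after __resolve_ckpt_dir. Below is a minimal usage sketch; it is not part of the patches, and the temporary directory, the "stale.ckpt" placeholder file, and the commented-out BoringModel fit call are illustrative assumptions rather than code from this PR.

    # Sketch: a non-empty dirpath with save_top_k != 0 triggers the rank-zero warning
    #   "Checkpoint directory <dirpath> exists and is not empty."
    import os
    import tempfile

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    dirpath = tempfile.mkdtemp()
    with open(os.path.join(dirpath, "stale.ckpt"), "w") as f:
        f.write("old checkpoint")  # pre-populate so the directory is not empty

    checkpoint_callback = ModelCheckpoint(dirpath=dirpath, save_top_k=1)
    trainer = Trainer(max_epochs=1, callbacks=[checkpoint_callback])
    # trainer.fit(BoringModel())  # any LightningModule; the warning is emitted when the pretrain routine starts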