From 230a49787ba1aa7fe9e541d22319066be5dc71aa Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Mon, 10 May 2021 01:49:30 +0300 Subject: [PATCH 01/16] Lazy saving option added. --- ignite/contrib/handlers/clearml_logger.py | 9 ++++++++ ignite/handlers/checkpoint.py | 26 +++++++++++++---------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index a66a48477eca..7eab14e04384 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -804,6 +804,15 @@ def get_local_copy(self, filename: str) -> Optional[str]: return None + @idist.one_rank_only() + def _save_func(self, checkpoint: Mapping, path: str, func: Callable, rank: int = 0) -> None: + self._lazy_opts = (checkpoint, path, func, rank) + + @idist.one_rank_only() + def force(self): + opts = self._lazy_opts + super(ClearMLSaver, self).__call__(*opts) + @idist.one_rank_only() def remove(self, filename: str) -> None: super(ClearMLSaver, self).remove(filename) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 74a6a076bdba..476f6f8d151b 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -401,6 +401,18 @@ def __call__(self, engine: Engine) -> None: "priority": priority, } + self._saved.append(Checkpoint.Item(priority, filename)) + self._saved.sort(key=lambda it: it[0]) + + if self.include_self: + # Now that we've updated _saved, we can add our own state_dict. + checkpoint["checkpointer"] = self.state_dict() + + try: + self.save_handler(checkpoint, filename, metadata) + except TypeError: + self.save_handler(checkpoint, filename) + try: index = list(map(lambda it: it.filename == filename, self._saved)).index(True) to_remove = True @@ -413,17 +425,9 @@ def __call__(self, engine: Engine) -> None: if isinstance(self.save_handler, BaseSaveHandler): self.save_handler.remove(item.filename) - self._saved.append(Checkpoint.Item(priority, filename)) - self._saved.sort(key=lambda it: it[0]) - - if self.include_self: - # Now that we've updated _saved, we can add our own state_dict. - checkpoint["checkpointer"] = self.state_dict() - - try: - self.save_handler(checkpoint, filename, metadata) - except TypeError: - self.save_handler(checkpoint, filename) + # Handling lazy savers, see Issue: #1986. + if hasattr(self.save_handler, "force"): + self.save_handler.force() def _setup_checkpoint(self) -> Dict[str, Dict[Any, Any]]: checkpoint = {} From 495fc84827e3dfb3d339ee75599a9fa1a88c5d55 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 01:27:35 +0300 Subject: [PATCH 02/16] Reverting from lazy save to direct save, allowing atomic option. --- ignite/contrib/handlers/clearml_logger.py | 9 --------- ignite/handlers/checkpoint.py | 4 ---- 2 files changed, 13 deletions(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 7eab14e04384..a66a48477eca 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -804,15 +804,6 @@ def get_local_copy(self, filename: str) -> Optional[str]: return None - @idist.one_rank_only() - def _save_func(self, checkpoint: Mapping, path: str, func: Callable, rank: int = 0) -> None: - self._lazy_opts = (checkpoint, path, func, rank) - - @idist.one_rank_only() - def force(self): - opts = self._lazy_opts - super(ClearMLSaver, self).__call__(*opts) - @idist.one_rank_only() def remove(self, filename: str) -> None: super(ClearMLSaver, self).remove(filename) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 476f6f8d151b..2c10459e4287 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -425,10 +425,6 @@ def __call__(self, engine: Engine) -> None: if isinstance(self.save_handler, BaseSaveHandler): self.save_handler.remove(item.filename) - # Handling lazy savers, see Issue: #1986. - if hasattr(self.save_handler, "force"): - self.save_handler.force() - def _setup_checkpoint(self) -> Dict[str, Dict[Any, Any]]: checkpoint = {} if self.to_save is not None: From 5d9aaeeb61f8ff7ccb1032a80af76db203efc6e0 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 01:44:29 +0300 Subject: [PATCH 03/16] Reverting from lazy save to direct save, allowing atomic option. --- ignite/handlers/checkpoint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 2c10459e4287..8154bd0addc7 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -630,7 +630,9 @@ class DiskSaver(BaseSaveHandler): dirname: Directory path where the checkpoint will be saved atomic: if True, checkpoint is serialized to a temporary file, and then moved to final destination, so that files are guaranteed to not be damaged - (for example if exception occurs during saving). + (for example if exception occurs during saving). Setting ``atomic=True`` is + recommended if ``n_saved=1`` is set in checkpoint object. For details, see + issue `#1986`_ create_dir: if True, will create directory ``dirname`` if it doesnt exist. require_empty: If True, will raise exception if there are any files in the directory ``dirname``. @@ -638,6 +640,8 @@ class DiskSaver(BaseSaveHandler): .. versionchanged:: 0.4.2 Accept ``kwargs`` for `torch.save` or `xm.save`. + + .. _#1986: https://github.com/pytorch/ignite/issues/1986 """ def __init__( From 2509cda87b22bb46bf3e58afd578bd5f1b0e312e Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 03:15:03 +0300 Subject: [PATCH 04/16] Clearml storagehelper implementation --- ignite/contrib/handlers/clearml_logger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index a66a48477eca..142075efa0dd 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -806,7 +806,10 @@ def get_local_copy(self, filename: str) -> Optional[str]: @idist.one_rank_only() def remove(self, filename: str) -> None: - super(ClearMLSaver, self).remove(filename) + from clearml.storage.helper import StorageHelper + + helper = StorageHelper.get(filename) + helper.delete(filename) for slots in self._checkpoint_slots.values(): try: slots[slots.index(filename)] = None From 4b6a6095d0af28920917b3a71ed8bf2c8d4a21f8 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 03:15:49 +0300 Subject: [PATCH 05/16] Typo fix. --- ignite/contrib/handlers/neptune_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/neptune_logger.py b/ignite/contrib/handlers/neptune_logger.py index 61cc7724507a..4c343a83e559 100644 --- a/ignite/contrib/handlers/neptune_logger.py +++ b/ignite/contrib/handlers/neptune_logger.py @@ -78,7 +78,7 @@ class NeptuneLogger(BaseLogger): project_name="shared/pytorch-ignite-integration", experiment_name="cnn-mnist", # Optional, params={"max_epochs": 10}, # Optional, - tags=["pytorch-ignite","minst"] # Optional + tags=["pytorch-ignite","mnist"] # Optional ) # Attach the logger to the trainer to log training loss at each iteration From 7c77901e074b6d671cd4b0e73210f66396a17da3 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 03:29:33 +0300 Subject: [PATCH 06/16] check for n_saved corrected. --- ignite/handlers/checkpoint.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 8154bd0addc7..f92b370a97f7 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -363,7 +363,7 @@ def __call__(self, engine: Engine) -> None: global_step = engine.state.get_event_attrib_value(Events.ITERATION_COMPLETED) priority = global_step - if self._check_lt_n_saved() or self._compare_fn(priority): + if self._check_lt_n_saved(or_equal=True) or self._compare_fn(priority): priority_str = f"{priority}" if isinstance(priority, numbers.Integral) else f"{priority:.4f}" @@ -413,12 +413,8 @@ def __call__(self, engine: Engine) -> None: except TypeError: self.save_handler(checkpoint, filename) - try: - index = list(map(lambda it: it.filename == filename, self._saved)).index(True) - to_remove = True - except ValueError: - index = 0 - to_remove = not self._check_lt_n_saved() + index = list(map(lambda it: it.filename == filename, self._saved)).index(True) + to_remove = not self._check_lt_n_saved(or_equal=True) if to_remove: item = self._saved.pop(index) From 853283d9314e508b72cad0f95cfd0d1f2a64596f Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 12 May 2021 03:55:21 +0300 Subject: [PATCH 07/16] logger added to StorageHelper. --- ignite/contrib/handlers/clearml_logger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 142075efa0dd..9bf7260cbcbf 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -808,7 +808,8 @@ def get_local_copy(self, filename: str) -> Optional[str]: def remove(self, filename: str) -> None: from clearml.storage.helper import StorageHelper - helper = StorageHelper.get(filename) + clearml_logger = self._task.get_logger() + helper = StorageHelper.get(filename, logger=clearml_logger) helper.delete(filename) for slots in self._checkpoint_slots.values(): try: From fbd58e5b40e25279829606f2d8084237bbbf7447 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Sat, 15 May 2021 15:19:01 +0300 Subject: [PATCH 08/16] Neptune Logger typo, reverted. --- ignite/contrib/handlers/neptune_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/neptune_logger.py b/ignite/contrib/handlers/neptune_logger.py index 4c343a83e559..61cc7724507a 100644 --- a/ignite/contrib/handlers/neptune_logger.py +++ b/ignite/contrib/handlers/neptune_logger.py @@ -78,7 +78,7 @@ class NeptuneLogger(BaseLogger): project_name="shared/pytorch-ignite-integration", experiment_name="cnn-mnist", # Optional, params={"max_epochs": 10}, # Optional, - tags=["pytorch-ignite","mnist"] # Optional + tags=["pytorch-ignite","minst"] # Optional ) # Attach the logger to the trainer to log training loss at each iteration From ea3b39b170c72a79ebfb8ac8aed0d3b024c225af Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Sat, 15 May 2021 15:34:46 +0300 Subject: [PATCH 09/16] Docstring revised. Note section added instead of linking to issue. --- ignite/handlers/checkpoint.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index f92b370a97f7..de474ec0086a 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -627,17 +627,24 @@ class DiskSaver(BaseSaveHandler): atomic: if True, checkpoint is serialized to a temporary file, and then moved to final destination, so that files are guaranteed to not be damaged (for example if exception occurs during saving). Setting ``atomic=True`` is - recommended if ``n_saved=1`` is set in checkpoint object. For details, see - issue `#1986`_ + recommended if ``n_saved=1`` is set in checkpoint object. See notes below + for detail. create_dir: if True, will create directory ``dirname`` if it doesnt exist. require_empty: If True, will raise exception if there are any files in the directory ``dirname``. kwargs: Accepted keyword arguments for `torch.save` or `xm.save`. + Note: + Setting ``atomic=True`` is generally recommended as it prevents saving a + checkpoint in case of any corruption on the checkpoint in progress. If + ``n_saved > 1`` set in checkpoint object, this is not needed as there are + always at least 2 saved checkpoints, and hence non-corrupt checkpoint is + always present. However, when ``n_saved=1`` is set, then to protect only saved + checkpoint, ``atomic=True`` is the only option to preserve a non-corrupt + checkpoint. + .. versionchanged:: 0.4.2 Accept ``kwargs`` for `torch.save` or `xm.save`. - - .. _#1986: https://github.com/pytorch/ignite/issues/1986 """ def __init__( From a0a7448fbcc9fda5ca4a828fdd184c00105c82a0 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Sat, 15 May 2021 15:36:31 +0300 Subject: [PATCH 10/16] Docstring revised. Note section added instead of linking to issue. --- ignite/handlers/checkpoint.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index de474ec0086a..3e505fcd476a 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -635,13 +635,12 @@ class DiskSaver(BaseSaveHandler): kwargs: Accepted keyword arguments for `torch.save` or `xm.save`. Note: - Setting ``atomic=True`` is generally recommended as it prevents saving a - checkpoint in case of any corruption on the checkpoint in progress. If - ``n_saved > 1`` set in checkpoint object, this is not needed as there are - always at least 2 saved checkpoints, and hence non-corrupt checkpoint is - always present. However, when ``n_saved=1`` is set, then to protect only saved - checkpoint, ``atomic=True`` is the only option to preserve a non-corrupt - checkpoint. + Setting ``atomic=True`` prevents saving a checkpoint in case of any corruption + on the checkpoint in progress. If ``n_saved > 1`` set in checkpoint object, + this is not needed as there are always at least 2 saved checkpoints, and hence + non-corrupt checkpoint is always present. However, when ``n_saved=1`` is set, + then to protect only saved checkpoint, ``atomic=True`` is the only option to + preserve a non-corrupt checkpoint. .. versionchanged:: 0.4.2 Accept ``kwargs`` for `torch.save` or `xm.save`. From 46ed98b76a759e4c29484723b33ba3019ed51984 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Sat, 15 May 2021 15:37:41 +0300 Subject: [PATCH 11/16] Docstring revised. --- ignite/handlers/checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 3e505fcd476a..7dbc6c091969 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -637,8 +637,8 @@ class DiskSaver(BaseSaveHandler): Note: Setting ``atomic=True`` prevents saving a checkpoint in case of any corruption on the checkpoint in progress. If ``n_saved > 1`` set in checkpoint object, - this is not needed as there are always at least 2 saved checkpoints, and hence - non-corrupt checkpoint is always present. However, when ``n_saved=1`` is set, + this is not strictly needed as there are always at least 2 saved checkpoints, and + hence non-corrupt checkpoint is always present. However, when ``n_saved=1`` is set, then to protect only saved checkpoint, ``atomic=True`` is the only option to preserve a non-corrupt checkpoint. From 090f0dae3e33e51ce84e18819ab8225963952fec Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Mon, 17 May 2021 02:58:30 +0300 Subject: [PATCH 12/16] logger removed from StorageHelper params. --- ignite/contrib/handlers/clearml_logger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 9bf7260cbcbf..b6047ad044b4 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -808,9 +808,9 @@ def get_local_copy(self, filename: str) -> Optional[str]: def remove(self, filename: str) -> None: from clearml.storage.helper import StorageHelper - clearml_logger = self._task.get_logger() - helper = StorageHelper.get(filename, logger=clearml_logger) + helper = StorageHelper.get(filename) helper.delete(filename) + for slots in self._checkpoint_slots.values(): try: slots[slots.index(filename)] = None From 5b2c981219b1294fafb10aae4ad9d5b64c2bb256 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Tue, 18 May 2021 01:32:03 +0300 Subject: [PATCH 13/16] Note section reduced in docstring. --- ignite/handlers/checkpoint.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 7dbc6c091969..2649815f819e 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -635,12 +635,9 @@ class DiskSaver(BaseSaveHandler): kwargs: Accepted keyword arguments for `torch.save` or `xm.save`. Note: - Setting ``atomic=True`` prevents saving a checkpoint in case of any corruption - on the checkpoint in progress. If ``n_saved > 1`` set in checkpoint object, - this is not strictly needed as there are always at least 2 saved checkpoints, and - hence non-corrupt checkpoint is always present. However, when ``n_saved=1`` is set, - then to protect only saved checkpoint, ``atomic=True`` is the only option to - preserve a non-corrupt checkpoint. + When ``n_saved=1`` is set in the checkpoint object, then to protect only saved + checkpoint, ``atomic=True`` is the only option to preserve a non-corrupt + checkpoint. .. versionchanged:: 0.4.2 Accept ``kwargs`` for `torch.save` or `xm.save`. From bdf2bb9b1bf2c243d22bdeaabbd42b5e5ef4a560 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Tue, 18 May 2021 21:38:29 +0300 Subject: [PATCH 14/16] Delete operation handle for earlier versions. --- ignite/contrib/handlers/clearml_logger.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index b6047ad044b4..c2223592c4c4 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -809,7 +809,12 @@ def remove(self, filename: str) -> None: from clearml.storage.helper import StorageHelper helper = StorageHelper.get(filename) - helper.delete(filename) + + try: + helper.delete(filename) + except ValueError: + warnings.warn("Checkpoints being uploaded to clearml-server with version " + "earlier than 1.0 does not support delete operation.") for slots in self._checkpoint_slots.values(): try: From 2c3f0c0d5ed652a84473ff058d185450d1d26d9f Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 19 May 2021 12:13:19 +0300 Subject: [PATCH 15/16] Black codestyle reformatted. --- ignite/contrib/handlers/clearml_logger.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index c2223592c4c4..81f45bcead3d 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -813,8 +813,10 @@ def remove(self, filename: str) -> None: try: helper.delete(filename) except ValueError: - warnings.warn("Checkpoints being uploaded to clearml-server with version " - "earlier than 1.0 does not support delete operation.") + warnings.warn( + "Checkpoints being uploaded to clearml-server with version " + "earlier than 1.0 does not support delete operation." + ) for slots in self._checkpoint_slots.values(): try: From 45c7eb69cb93e364b38583bae185a8de07d63dba Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 19 May 2021 12:14:59 +0300 Subject: [PATCH 16/16] Version reformatted as . --- ignite/contrib/handlers/clearml_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 81f45bcead3d..35f7bc67a0f6 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -815,7 +815,7 @@ def remove(self, filename: str) -> None: except ValueError: warnings.warn( "Checkpoints being uploaded to clearml-server with version " - "earlier than 1.0 does not support delete operation." + "earlier than 1.0.0 does not support delete operation." ) for slots in self._checkpoint_slots.values():