diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index c60437971cfe..02079c11f253 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -810,7 +810,18 @@ def get_local_copy(self, filename: str) -> Optional[str]: @idist.one_rank_only() def remove(self, filename: str) -> None: - super(ClearMLSaver, self).remove(filename) + from clearml.storage.helper import StorageHelper + + helper = StorageHelper.get(filename) + + try: + helper.delete(filename) + except ValueError: + warnings.warn( + "Checkpoints being uploaded to clearml-server with version " + "earlier than 1.0.0 does not support delete operation." + ) + for slots in self._checkpoint_slots.values(): try: slots[slots.index(filename)] = None diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index 3291d6fde113..9344d85f25e3 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -362,7 +362,7 @@ def __call__(self, engine: Engine) -> None: global_step = engine.state.get_event_attrib_value(Events.ITERATION_COMPLETED) priority = global_step - if self._check_lt_n_saved() or self._compare_fn(priority): + if self._check_lt_n_saved(or_equal=True) or self._compare_fn(priority): priority_str = f"{priority}" if isinstance(priority, numbers.Integral) else f"{priority:.4f}" @@ -400,18 +400,6 @@ def __call__(self, engine: Engine) -> None: "priority": priority, } - try: - index = list(map(lambda it: it.filename == filename, self._saved)).index(True) - to_remove = True - except ValueError: - index = 0 - to_remove = not self._check_lt_n_saved() - - if to_remove: - item = self._saved.pop(index) - if isinstance(self.save_handler, BaseSaveHandler): - self.save_handler.remove(item.filename) - self._saved.append(Checkpoint.Item(priority, filename)) self._saved.sort(key=lambda it: it[0]) @@ -424,6 +412,14 @@ def __call__(self, engine: Engine) -> None: except TypeError: 
self.save_handler(checkpoint, filename) + index = list(map(lambda it: it.filename == filename, self._saved)).index(True) + to_remove = not self._check_lt_n_saved(or_equal=True) + + if to_remove: + item = self._saved.pop(index) + if isinstance(self.save_handler, BaseSaveHandler): + self.save_handler.remove(item.filename) + def _setup_checkpoint(self) -> Dict[str, Dict[Any, Any]]: checkpoint = {} if self.to_save is not None: @@ -629,12 +625,19 @@ class DiskSaver(BaseSaveHandler): dirname: Directory path where the checkpoint will be saved atomic: if True, checkpoint is serialized to a temporary file, and then moved to final destination, so that files are guaranteed to not be damaged - (for example if exception occurs during saving). + (for example if exception occurs during saving). Setting ``atomic=True`` is + recommended if ``n_saved=1`` is set in the checkpoint object. See the note below + for details. create_dir: if True, will create directory ``dirname`` if it doesnt exist. require_empty: If True, will raise exception if there are any files in the directory ``dirname``. kwargs: Accepted keyword arguments for `torch.save` or `xm.save`. + Note: + When ``n_saved=1`` is set in the checkpoint object, ``atomic=True`` is the only + option that protects the single saved checkpoint and preserves a non-corrupt + checkpoint. + .. versionchanged:: 0.4.2 Accept ``kwargs`` for `torch.save` or `xm.save`. """