Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit c7eef5e

Browse files
authored
Downloading checkpoints from AML if not found on disk (#614)
Workaround for a temporary issue with low-priority preemption: checkpoint files are sometimes not available on disk when a job restarts. If no checkpoints are found on disk, the code now tries to download them from AzureML (AML).
1 parent a3f0046 commit c7eef5e

18 files changed

+291
-220
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ jobs that run in AzureML.
3232
- ([#589](https://github.com/microsoft/InnerEye-DeepLearning/pull/589)) Add `LightningContainer.update_azure_config()`
3333
hook to enable overriding `AzureConfig` parameters from a container (e.g. `experiment_name`, `cluster`, `num_nodes`).
3434
- ([#603](https://github.com/microsoft/InnerEye-DeepLearning/pull/603)) Add histopathology module
35+
- ([#614](https://github.com/microsoft/InnerEye-DeepLearning/pull/614)) Checkpoint downloading falls back to looking into AzureML if no checkpoints are found on disk
3536
- ([#613](https://github.com/microsoft/InnerEye-DeepLearning/pull/613)) Add additional tests for histopathology datasets
3637

3738

InnerEye/ML/common.py

Lines changed: 16 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,13 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
44
# ------------------------------------------------------------------------------------------
5+
from __future__ import annotations
6+
57
import abc
6-
import logging
7-
import re
88
from datetime import datetime
99
from enum import Enum, unique
1010
from pathlib import Path
11-
from typing import Any, Dict, List, Optional, Tuple
12-
13-
import numpy as np
11+
from typing import Any, Dict, List
1412

1513
DATASET_CSV_FILE_NAME = "dataset.csv"
1614
CHECKPOINT_SUFFIX = ".ckpt"
@@ -26,6 +24,13 @@
2624
LAST_CHECKPOINT_FILE_NAME = "last"
2725
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = LAST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX
2826

27+
FINAL_MODEL_FOLDER = "final_model"
28+
FINAL_ENSEMBLE_MODEL_FOLDER = "final_ensemble_model"
29+
CHECKPOINT_FOLDER = "checkpoints"
30+
VISUALIZATION_FOLDER = "visualizations"
31+
EXTRA_RUN_SUBFOLDER = "extra_run_id"
32+
ARGS_TXT = "args.txt"
33+
2934

3035
@unique
3136
class ModelExecutionMode(Enum):
@@ -64,18 +69,14 @@ def get_feature_length(self, column: str) -> int:
6469
raise NotImplementedError("get_feature_length must be implemented by sub classes")
6570

6671

67-
def get_recovery_checkpoint_path(path: Path) -> Path:
72+
def create_unique_timestamp_id() -> str:
6873
"""
69-
Returns the path to the last recovery checkpoint in the given folder or the provided filename. Raises a
70-
FileNotFoundError if no
71-
recovery checkpoint file is present.
72-
:param path: Path to checkpoint folder
74+
Creates a unique string using the current time in UTC, up to seconds precision, with characters that
75+
are suitable for use in filenames. For example, on 31 Dec 2019 at 11:59:59pm UTC, the result would be
76+
2019-12-31T235959Z.
7377
"""
74-
recovery_ckpt_and_epoch = find_recovery_checkpoint_and_epoch(path)
75-
if recovery_ckpt_and_epoch is not None:
76-
return recovery_ckpt_and_epoch[0]
77-
files = list(path.glob("*"))
78-
raise FileNotFoundError(f"No checkpoint files found in {path}. Existing files: {' '.join(p.name for p in files)}")
78+
unique_id = datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")
79+
return unique_id
7980

8081

8182
def get_best_checkpoint_path(path: Path) -> Path:
@@ -84,73 +85,3 @@ def get_best_checkpoint_path(path: Path) -> Path:
8485
:param path to checkpoint folder
8586
"""
8687
return path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
87-
88-
89-
def find_all_recovery_checkpoints(path: Path) -> Optional[List[Path]]:
90-
"""
91-
Extracts all file starting with RECOVERY_CHECKPOINT_FILE_NAME in path
92-
:param path:
93-
:return:
94-
"""
95-
all_recovery_files = [f for f in path.glob(RECOVERY_CHECKPOINT_FILE_NAME + "*")]
96-
if len(all_recovery_files) == 0:
97-
return None
98-
return all_recovery_files
99-
100-
101-
PathAndEpoch = Tuple[Path, int]
102-
103-
104-
def extract_latest_checkpoint_and_epoch(available_files: List[Path]) -> PathAndEpoch:
105-
"""
106-
Checkpoints are saved as recovery_epoch={epoch}.ckpt, find the latest ckpt and epoch number.
107-
:param available_files: all available checkpoints
108-
:return: path the checkpoint from latest epoch and epoch number
109-
"""
110-
recovery_epochs = [int(re.findall(r"[\d]+", f.stem)[0]) for f in available_files]
111-
idx_max_epoch = int(np.argmax(recovery_epochs))
112-
return available_files[idx_max_epoch], recovery_epochs[idx_max_epoch]
113-
114-
115-
def find_recovery_checkpoint_and_epoch(path: Path) -> Optional[PathAndEpoch]:
116-
"""
117-
Looks at all the recovery files, extracts the epoch number for all of them and returns the most recent (latest
118-
epoch)
119-
checkpoint path along with the corresponding epoch number. If no recovery checkpoint are found, return None.
120-
:param path: The folder to start searching in.
121-
:return: None if there is no file matching the search pattern, or a Tuple with Path object and integer pointing to
122-
recovery checkpoint path and recovery epoch.
123-
"""
124-
available_checkpoints = find_all_recovery_checkpoints(path)
125-
if available_checkpoints is not None:
126-
return extract_latest_checkpoint_and_epoch(available_checkpoints)
127-
return None
128-
129-
130-
def create_best_checkpoint(path: Path) -> Path:
131-
"""
132-
Creates the best checkpoint file. "Best" is at the moment defined as being the last checkpoint, but could be
133-
based on some defined policy.
134-
The best checkpoint will be renamed to `best_checkpoint.ckpt`.
135-
:param path: The folder that contains all checkpoint files.
136-
"""
137-
logging.debug(f"Files in checkpoint folder: {' '.join(p.name for p in path.glob('*'))}")
138-
last_ckpt = path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
139-
all_files = f"Existing files: {' '.join(p.name for p in path.glob('*'))}"
140-
if not last_ckpt.is_file():
141-
raise FileNotFoundError(f"Checkpoint file {LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX} not found. {all_files}")
142-
logging.info(f"Using {LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX} as the best checkpoint: Renaming to "
143-
f"{BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX}")
144-
best = path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
145-
last_ckpt.rename(best)
146-
return best
147-
148-
149-
def create_unique_timestamp_id() -> str:
150-
"""
151-
Creates a unique string using the current time in UTC, up to seconds precision, with characters that
152-
are suitable for use in filenames. For example, on 31 Dec 2019 at 11:59:59pm UTC, the result would be
153-
2019-12-31T235959Z.
154-
"""
155-
unique_id = datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")
156-
return unique_id

InnerEye/ML/deep_learning_config.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,9 @@
1919
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR_NAME
2020
from InnerEye.Common.generic_parsing import GenericConfig
2121
from InnerEye.Common.type_annotations import PathOrString, T, TupleFloat2
22-
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, create_unique_timestamp_id, \
23-
get_best_checkpoint_path, get_recovery_checkpoint_path
24-
25-
# A folder inside of the outputs folder that will contain all information for running the model in inference mode
26-
27-
FINAL_MODEL_FOLDER = "final_model"
28-
FINAL_ENSEMBLE_MODEL_FOLDER = "final_ensemble_model"
29-
30-
# The checkpoints must be stored inside of the final model folder, if we want to avoid copying
31-
# them before registration.
32-
CHECKPOINT_FOLDER = "checkpoints"
33-
VISUALIZATION_FOLDER = "visualizations"
34-
EXTRA_RUN_SUBFOLDER = "extra_run_id"
35-
36-
ARGS_TXT = "args.txt"
22+
from InnerEye.ML.common import CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME, \
23+
ModelExecutionMode, VISUALIZATION_FOLDER, \
24+
create_unique_timestamp_id, get_best_checkpoint_path
3725

3826

3927
@unique
@@ -487,6 +475,7 @@ def get_path_to_checkpoint(self) -> Path:
487475
"""
488476
Returns the full path to a recovery checkpoint.
489477
"""
478+
from InnerEye.ML.utils.checkpoint_handling import get_recovery_checkpoint_path
490479
return get_recovery_checkpoint_path(self.checkpoint_folder)
491480

492481
def get_path_to_best_checkpoint(self) -> Path:

InnerEye/ML/lightning_container.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
WorkflowParams
2424
from InnerEye.ML.utils import model_util
2525
from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
26-
from InnerEye.ML.utils.run_recovery import RunRecovery
2726

2827

2928
class InnerEyeInference(abc.ABC):
@@ -151,7 +150,8 @@ def __init__(self, **kwargs: Any) -> None:
151150
super().__init__(**kwargs)
152151
self._model: Optional[LightningModule] = None
153152
self._model_name = type(self).__name__
154-
self.pretraining_run_checkpoints: Optional[RunRecovery] = None
153+
# This should be typed RunRecovery, but causes circular imports
154+
self.pretraining_run_checkpoints: Optional[Any] = None
155155
self.num_nodes = 1
156156

157157
def validate(self) -> None:

InnerEye/ML/model_training.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
2020
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory
2121
from InnerEye.Common.resource_monitor import ResourceMonitor
22-
from InnerEye.ML.common import ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, create_best_checkpoint
23-
from InnerEye.ML.deep_learning_config import ARGS_TXT, VISUALIZATION_FOLDER
22+
from InnerEye.ML.common import ARGS_TXT, ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, VISUALIZATION_FOLDER
23+
from InnerEye.ML.utils.checkpoint_handling import create_best_checkpoint
2424
from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning
2525
from InnerEye.ML.lightning_container import LightningContainer
2626
from InnerEye.ML.lightning_loggers import StoringLogger

InnerEye/ML/normalize_and_visualize_dataset.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@
1515
from InnerEye.Common.common_util import logging_to_stdout
1616
from InnerEye.Common.generic_parsing import GenericConfig
1717
from InnerEye.ML import plotting
18-
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
18+
from InnerEye.ML.common import ARGS_TXT, DATASET_CSV_FILE_NAME
1919
from InnerEye.ML.config import SegmentationModelBase
2020
from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
21-
from InnerEye.ML.deep_learning_config import ARGS_TXT
2221
from InnerEye.ML.photometric_normalization import PhotometricNormalization
2322
from InnerEye.ML.utils.config_loader import ModelConfigLoader
2423
from InnerEye.ML.utils.io_util import load_images_from_dataset_source

InnerEye/ML/run_ml.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,12 @@
3636
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PYTHON_ENVIRONMENT_NAME
3737
from InnerEye.Common.type_annotations import PathOrString
3838
from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
39-
from InnerEye.ML.common import ModelExecutionMode
39+
from InnerEye.ML.common import CHECKPOINT_FOLDER, EXTRA_RUN_SUBFOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, \
40+
FINAL_MODEL_FOLDER, \
41+
ModelExecutionMode
4042
from InnerEye.ML.config import SegmentationModelBase
41-
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, EXTRA_RUN_SUBFOLDER, \
42-
FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint
43+
from InnerEye.ML.deep_learning_config import DeepLearningConfig, ModelCategory, MultiprocessingStartMethod, \
44+
load_checkpoint
4345
from InnerEye.ML.lightning_base import InnerEyeContainer
4446
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
4547
from InnerEye.ML.lightning_loggers import StoringLogger
@@ -53,8 +55,7 @@
5355
get_ipynb_report_name, reports_folder
5456
from InnerEye.ML.scalar_config import ScalarModelBase
5557
from InnerEye.ML.sequence_config import SequenceModelBase
56-
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
57-
from InnerEye.ML.utils.run_recovery import RunRecovery
58+
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler, download_all_checkpoints_from_run
5859
from InnerEye.ML.visualizers import activation_maps
5960
from InnerEye.ML.visualizers.plot_cross_validation import \
6061
get_config_and_results_for_offline_runs, plot_cross_validation_from_files
@@ -183,10 +184,10 @@ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None:
183184
if self.container.pretraining_run_recovery_id is not None:
184185
run_to_recover = self.azure_config.fetch_run(self.container.pretraining_run_recovery_id.strip())
185186
only_return_path = not is_global_rank_zero()
186-
run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.container,
187-
run_to_recover,
188-
EXTRA_RUN_SUBFOLDER,
189-
only_return_path=only_return_path)
187+
run_recovery_object = download_all_checkpoints_from_run(self.container,
188+
run_to_recover,
189+
EXTRA_RUN_SUBFOLDER,
190+
only_return_path=only_return_path)
190191
self.container.pretraining_run_checkpoints = run_recovery_object
191192

192193
# A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.

0 commit comments

Comments
 (0)