1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -62,6 +62,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to a new version of the hi-ml package.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
- ([#615](https://github.com/microsoft/InnerEye-DeepLearning/pull/615)) Minor changes to checkpoint download from AzureML.
- ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.

### Fixed
28 changes: 17 additions & 11 deletions InnerEye/ML/utils/checkpoint_handling.py
@@ -240,9 +240,14 @@ def get_local_checkpoints_path_or_download(self) -> List[Path]:
return checkpoint_paths


def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Optional[Workspace] = None) -> Path:
def download_folder_from_run_to_temp_folder(folder: str,
run: Optional[Run] = None,
workspace: Optional[Workspace] = None) -> Path:
"""
Downloads all files with the outputs/checkpoints prefix of the given run to a temporary folder.
Downloads all files from a run that have the given prefix to a temporary folder.
For example, if the run contains files "foo/bar.txt" and "nothing.txt", and this function is called with
argument folder = "foo", it will return the path of a temporary folder that contains "bar.txt".

In distributed training, the download only happens once per node.

:param run: If provided, download the files from that run. If omitted, download the files from the current run
@@ -252,25 +257,25 @@ def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Op
:return: The path to which the files were downloaded. The files are located in that folder, without any further
subfolders.
"""
if not is_running_in_azure_ml() and run is None:
raise ValueError("When running outside AzureML, the run to download from must be set.")
run = run or RUN_CONTEXT
# Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
# cause artifact conflicts later.
temp_folder = Path(tempfile.mkdtemp())
checkpoint_prefix = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
existing_checkpoints = get_run_file_names(run, prefix=checkpoint_prefix)
cleaned_prefix = folder.strip("/") + "/"
existing_checkpoints = get_run_file_names(run, prefix=cleaned_prefix)
Review comment (Member): We still refer to checkpoints here and a few times below, but these files don't necessarily have to be checkpoints. Rename to something more generic?

logging.info(f"Number of checkpoints available in AzureML: {len(existing_checkpoints)}")
if len(existing_checkpoints) > 0:
try:
logging.info(f"Downloading checkpoints to {temp_folder}")
download_files_from_run_id(run_id=run.id,
output_folder=temp_folder,
prefix=checkpoint_prefix,
prefix=cleaned_prefix,
workspace=workspace)
except Exception as ex:
logging.warning(f"Unable to download checkpoints from AzureML. Error: {str(ex)}")
# Checkpoint downloads preserve the full folder structure; point the caller directly to the folder where the
# checkpoints really are.
return temp_folder / DEFAULT_AML_UPLOAD_DIR / CHECKPOINT_FOLDER
return temp_folder / cleaned_prefix


PathAndEpoch = Tuple[Path, int]
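For reviewers who want to exercise the new function outside AzureML, here is a minimal usage sketch; the run ID is a placeholder, and the workspace is assumed to come from a local config.json:

from azureml.core import Run, Workspace

from InnerEye.ML.utils.checkpoint_handling import download_folder_from_run_to_temp_folder

workspace = Workspace.from_config()  # assumes a config.json in the working directory
run = Run.get(workspace, "HD_0123abcd_0")  # placeholder run ID
# Fetch everything stored under outputs/checkpoints/ for that run.
temp_folder = download_folder_from_run_to_temp_folder(folder="outputs/checkpoints/",
                                                      run=run,
                                                      workspace=workspace)
print(sorted(temp_folder.glob("*")))  # the downloaded files, with no further subfolders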
@@ -287,10 +292,11 @@ def find_recovery_checkpoint_and_epoch(path: Path) -> Optional[PathAndEpoch]:
"""
available_checkpoints = find_all_recovery_checkpoints(path)
if available_checkpoints is None and is_running_in_azure_ml():
logging.info("No recovery checkpoints available in the checkpoint folder. Trying to find checkpoints in "
"AzureML from previous runs of this job.")
logging.info("No checkpoints available in the checkpoint folder. Trying to find checkpoints in AzureML.")
# Download checkpoints from AzureML, then try to find recovery checkpoints among those.
temp_folder = download_checkpoints_to_temp_folder()
# Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
# cause artifact conflicts later.
temp_folder = download_folder_from_run_to_temp_folder(folder=f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/")
available_checkpoints = find_all_recovery_checkpoints(temp_folder)
if available_checkpoints is not None:
return extract_latest_checkpoint_and_epoch(available_checkpoints)
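One detail worth highlighting in this file is the prefix normalization via folder.strip("/") + "/". A small self-contained sketch of that behavior, assuming callers may pass the folder with or without surrounding slashes:

def clean_prefix(folder: str) -> str:
    # Same normalization as cleaned_prefix above: "foo", "foo/" and "/foo/"
    # all become "foo/".
    return folder.strip("/") + "/"

assert clean_prefix("outputs/checkpoints") == "outputs/checkpoints/"
assert clean_prefix("/outputs/checkpoints/") == "outputs/checkpoints/"
# Because downloads preserve the folder structure, the files land in
# <temp_folder>/outputs/checkpoints/, which is exactly the path returned.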
17 changes: 11 additions & 6 deletions Tests/AfterTraining/test_after_training.py
@@ -29,7 +29,8 @@
from InnerEye.Common import common_util, fixed_paths, fixed_paths_for_tests
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, \
get_best_epoch_results_path
from InnerEye.Common.fixed_paths import (DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME,
from InnerEye.Common.fixed_paths import (DEFAULT_AML_UPLOAD_DIR, DEFAULT_RESULT_IMAGE_NAME,
DEFAULT_RESULT_ZIP_DICOM_NAME,
PYTHON_ENVIRONMENT_NAME, repository_root_directory)
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
@@ -44,7 +45,7 @@
from InnerEye.ML.reports.notebook_report import get_html_report_name
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.runner import main
from InnerEye.ML.utils.checkpoint_handling import download_checkpoints_to_temp_folder, \
from InnerEye.ML.utils.checkpoint_handling import download_folder_from_run_to_temp_folder, \
find_recovery_checkpoint_and_epoch
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.image_util import get_unit_image_header
@@ -225,7 +226,10 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
Check that we can download checkpoint files from an AzureML run, if they are not available on disk.
"""
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
temp_folder = download_checkpoints_to_temp_folder(run, workspace=get_default_workspace())
checkpoint_folder = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
temp_folder = download_folder_from_run_to_temp_folder(folder=checkpoint_folder,
run=run,
workspace=get_default_workspace())
files = list(temp_folder.glob("*"))
assert len(files) == 2
# Check that the contents of the folder are really files, not directories
@@ -234,11 +238,12 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
# Now test if that is correctly integrated into the checkpoint finder. To avoid downloading a second time,
# now mock the call to the actual downloader.
with mock.patch("InnerEye.ML.utils.checkpoint_handling.is_running_in_azure_ml", return_value=True):
with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_checkpoints_to_temp_folder",
with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_folder_from_run_to_temp_folder",
return_value=temp_folder) as download:
# Call the checkpoint finder with a temp folder that does not contain any files, so it should try to download
# Call the checkpoint finder with a temp folder that does not contain any files, so it should try to
# download
result = find_recovery_checkpoint_and_epoch(test_output_dirs.root_dir)
download.assert_called_once_with()
download.assert_called_once_with(folder=checkpoint_folder)
assert result is not None
p, epoch = result
# The basic model only writes one checkpoint at epoch 1
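The tightened assertion download.assert_called_once_with(folder=checkpoint_folder) verifies both the call count and the exact arguments. A self-contained sketch of that behavior, with illustrative names only:

from unittest import mock

download = mock.MagicMock(return_value="/tmp/fake-checkpoints")
download(folder="outputs/checkpoints/")
# Passes: exactly one call, made with exactly these keyword arguments.
download.assert_called_once_with(folder="outputs/checkpoints/")
# A positional call, download("outputs/checkpoints/"), would fail this
# keyword-based assertion even though the value is the same.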