diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4733a078..c34cb5bd9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -62,6 +62,7 @@ gets uploaded to AzureML, by skipping all test folders.
 - ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
 - ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
 - ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
+- ([#615](https://github.com/microsoft/InnerEye-DeepLearning/pull/615)) Minor changes to checkpoint download from AzureML.
 - ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.
 
 ### Fixed
diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py
index 7cbe03060..9b187a97a 100644
--- a/InnerEye/ML/utils/checkpoint_handling.py
+++ b/InnerEye/ML/utils/checkpoint_handling.py
@@ -240,9 +240,14 @@ def get_local_checkpoints_path_or_download(self) -> List[Path]:
         return checkpoint_paths
 
 
-def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Optional[Workspace] = None) -> Path:
+def download_folder_from_run_to_temp_folder(folder: str,
+                                            run: Optional[Run] = None,
+                                            workspace: Optional[Workspace] = None) -> Path:
     """
-    Downloads all files with the outputs/checkpoints prefix of the given run to a temporary folder.
+    Downloads all files from a run that have the given prefix to a temporary folder.
+    For example, if the run contains files "foo/bar.txt" and "nothing.txt", and this function is called with
+    argument folder = "foo", it will return a path in a temp file system pointing to "bar.txt".
+    In distributed training, the download only happens once per node.
 
     :param run: If provided, download the files from that run. If omitted, download the files from the current run
 
@@ -252,25 +257,25 @@ def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Op
     :return: The path to which the files were downloaded. The files are located in that folder, without any further
         subfolders.
     """
+    if not is_running_in_azure_ml() and run is None:
+        raise ValueError("When running outside AzureML, the run to download from must be set.")
     run = run or RUN_CONTEXT
-    # Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
-    # cause artifact conflicts later.
     temp_folder = Path(tempfile.mkdtemp())
-    checkpoint_prefix = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
-    existing_checkpoints = get_run_file_names(run, prefix=checkpoint_prefix)
+    cleaned_prefix = folder.strip("/") + "/"
+    existing_checkpoints = get_run_file_names(run, prefix=cleaned_prefix)
     logging.info(f"Number of checkpoints available in AzureML: {len(existing_checkpoints)}")
     if len(existing_checkpoints) > 0:
         try:
             logging.info(f"Downloading checkpoints to {temp_folder}")
             download_files_from_run_id(run_id=run.id,
                                        output_folder=temp_folder,
-                                       prefix=checkpoint_prefix,
+                                       prefix=cleaned_prefix,
                                        workspace=workspace)
         except Exception as ex:
             logging.warning(f"Unable to download checkpoints from AzureML. Error: {str(ex)}")
     # Checkpoint downloads preserve the full folder structure, point the caller directly to the folder where the
     # checkpoints really are.
-    return temp_folder / DEFAULT_AML_UPLOAD_DIR / CHECKPOINT_FOLDER
+    return temp_folder / cleaned_prefix
 
 
 PathAndEpoch = Tuple[Path, int]
@@ -287,10 +292,11 @@ def find_recovery_checkpoint_and_epoch(path: Path) -> Optional[PathAndEpoch]:
     """
     available_checkpoints = find_all_recovery_checkpoints(path)
     if available_checkpoints is None and is_running_in_azure_ml():
-        logging.info("No recovery checkpoints available in the checkpoint folder. Trying to find checkpoints in "
-                     "AzureML from previous runs of this job.")
+        logging.info("No checkpoints available in the checkpoint folder. Trying to find checkpoints in AzureML.")
         # Download checkpoints from AzureML, then try to find recovery checkpoints among those.
-        temp_folder = download_checkpoints_to_temp_folder()
+        # Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
+        # cause artifact conflicts later.
+        temp_folder = download_folder_from_run_to_temp_folder(folder=f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/")
         available_checkpoints = find_all_recovery_checkpoints(temp_folder)
         if available_checkpoints is not None:
             return extract_latest_checkpoint_and_epoch(available_checkpoints)
diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 48176b389..99e651c0a 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -29,7 +29,8 @@
 from InnerEye.Common import common_util, fixed_paths, fixed_paths_for_tests
 from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, \
     get_best_epoch_results_path
-from InnerEye.Common.fixed_paths import (DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME,
+from InnerEye.Common.fixed_paths import (DEFAULT_AML_UPLOAD_DIR, DEFAULT_RESULT_IMAGE_NAME,
+                                         DEFAULT_RESULT_ZIP_DICOM_NAME,
                                          PYTHON_ENVIRONMENT_NAME, repository_root_directory)
 from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
 from InnerEye.Common.output_directories import OutputFolderForTests
@@ -44,7 +45,7 @@
 from InnerEye.ML.reports.notebook_report import get_html_report_name
 from InnerEye.ML.run_ml import MLRunner
 from InnerEye.ML.runner import main
-from InnerEye.ML.utils.checkpoint_handling import download_checkpoints_to_temp_folder, \
+from InnerEye.ML.utils.checkpoint_handling import download_folder_from_run_to_temp_folder, \
     find_recovery_checkpoint_and_epoch
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
 from InnerEye.ML.utils.image_util import get_unit_image_header
@@ -225,7 +226,10 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
     Check that we can download checkpoint files from an AzureML run, if they are not available on disk.
     """
     run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
-    temp_folder = download_checkpoints_to_temp_folder(run, workspace=get_default_workspace())
+    checkpoint_folder = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
+    temp_folder = download_folder_from_run_to_temp_folder(folder=checkpoint_folder,
+                                                          run=run,
+                                                          workspace=get_default_workspace())
     files = list(temp_folder.glob("*"))
     assert len(files) == 2
     # Test if what's in the folder are really files, not directories
@@ -234,11 +238,12 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
     # Now test if that is correctly integrated into the checkpoint finder. To avoid downloading a second time,
     # now mock the call to the actual downloader.
     with mock.patch("InnerEye.ML.utils.checkpoint_handling.is_running_in_azure_ml", return_value=True):
-        with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_checkpoints_to_temp_folder",
+        with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_folder_from_run_to_temp_folder",
                         return_value=temp_folder) as download:
-            # Call the checkpoint finder with a temp folder that does not contain any files, so it should try to download
+            # Call the checkpoint finder with a temp folder that does not contain any files, so it should try to
+            # download
             result = find_recovery_checkpoint_and_epoch(test_output_dirs.root_dir)
-            download.assert_called_once_with()
+            download.assert_called_once_with(folder=checkpoint_folder)
             assert result is not None
             p, epoch = result
             # The basic model only writes one checkpoint at epoch 1
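
Note (illustration only, not part of the patch): a minimal sketch of how the cleaned prefix determines the folder that download_folder_from_run_to_temp_folder returns. The temp folder path and checkpoint file name below are placeholders.

from pathlib import Path

# With folder = "outputs/checkpoints/", the prefix is normalized to "outputs/checkpoints/".
# Downloads preserve the run's folder structure under the temp folder, so the function
# points the caller at <temp>/outputs/checkpoints rather than at <temp> itself.
folder = "outputs/checkpoints/"
cleaned_prefix = folder.strip("/") + "/"        # -> "outputs/checkpoints/"
temp_folder = Path("/tmp/tmp1234abcd")          # placeholder for tempfile.mkdtemp()
returned = temp_folder / cleaned_prefix         # -> /tmp/tmp1234abcd/outputs/checkpoints
print(returned / "best_checkpoint.ckpt")        # where a downloaded checkpoint would sit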