1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -62,6 +62,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to a new version of the hi-ml package.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
- ([#615](https://github.com/microsoft/InnerEye-DeepLearning/pull/615)) Minor changes to checkpoint download from AzureML.
- ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.

### Fixed
28 changes: 17 additions & 11 deletions InnerEye/ML/utils/checkpoint_handling.py
@@ -240,9 +240,14 @@ def get_local_checkpoints_path_or_download(self) -> List[Path]:
return checkpoint_paths


def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Optional[Workspace] = None) -> Path:
def download_folder_from_run_to_temp_folder(folder: str,
run: Optional[Run] = None,
workspace: Optional[Workspace] = None) -> Path:
"""
Downloads all files with the outputs/checkpoints prefix of the given run to a temporary folder.
Downloads all files from a run that have the given prefix to a temporary folder.
For example, if the run contains files "foo/bar.txt" and "nothing.txt", and this function is called with
argument folder = "foo", it will return the path of a temporary folder that contains "bar.txt".

In distributed training, the download only happens once per node.

:param run: If provided, download the files from that run. If omitted, download the files from the current run
@@ -252,25 +257,25 @@ def download_checkpoints_to_temp_folder(run: Optional[Run] = None, workspace: Op
:return: The path to which the files were downloaded. The files are located in that folder, without any further
subfolders.
"""
if not is_running_in_azure_ml() and run is None:
raise ValueError("When running outside AzureML, the run to download from must be set.")
run = run or RUN_CONTEXT
# Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
# cause artifact conflicts later.
temp_folder = Path(tempfile.mkdtemp())
checkpoint_prefix = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
existing_checkpoints = get_run_file_names(run, prefix=checkpoint_prefix)
cleaned_prefix = folder.strip("/") + "/"
existing_checkpoints = get_run_file_names(run, prefix=cleaned_prefix)
Review comment (Member): We still refer to checkpoints here and a few times below, but these files don't necessarily have to be checkpoints. Rename to something more generic?

logging.info(f"Number of checkpoints available in AzureML: {len(existing_checkpoints)}")
if len(existing_checkpoints) > 0:
try:
logging.info(f"Downloading checkpoints to {temp_folder}")
download_files_from_run_id(run_id=run.id,
output_folder=temp_folder,
prefix=checkpoint_prefix,
prefix=cleaned_prefix,
workspace=workspace)
except Exception as ex:
logging.warning(f"Unable to download checkpoints from AzureML. Error: {str(ex)}")
# Checkpoint downloads preserve the full folder structure; point the caller directly to the folder where the
# checkpoints really are.
return temp_folder / DEFAULT_AML_UPLOAD_DIR / CHECKPOINT_FOLDER
return temp_folder / cleaned_prefix


PathAndEpoch = Tuple[Path, int]
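For reviewers who want to exercise the new function outside AzureML, here is a minimal usage sketch; the run ID is a placeholder, and the workspace is assumed to come from a local config.json:

from azureml.core import Run, Workspace

from InnerEye.ML.utils.checkpoint_handling import download_folder_from_run_to_temp_folder

workspace = Workspace.from_config()  # assumes a config.json in the working directory
run = Run.get(workspace, "HD_0123abcd_0")  # placeholder run ID
# Fetch everything stored under outputs/checkpoints/ for that run.
temp_folder = download_folder_from_run_to_temp_folder(folder="outputs/checkpoints/",
                                                      run=run,
                                                      workspace=workspace)
print(sorted(temp_folder.glob("*")))  # the downloaded files, with no further subfolders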
@@ -287,10 +292,11 @@ def find_recovery_checkpoint_and_epoch(path: Path) -> Optional[PathAndEpoch]:
"""
available_checkpoints = find_all_recovery_checkpoints(path)
if available_checkpoints is None and is_running_in_azure_ml():
logging.info("No recovery checkpoints available in the checkpoint folder. Trying to find checkpoints in "
"AzureML from previous runs of this job.")
logging.info("No checkpoints available in the checkpoint folder. Trying to find checkpoints in AzureML.")
# Download checkpoints from AzureML, then try to find recovery checkpoints among those.
temp_folder = download_checkpoints_to_temp_folder()
# Downloads should go to a temporary folder because downloading the files to the checkpoint folder might
# cause artifact conflicts later.
temp_folder = download_folder_from_run_to_temp_folder(folder=f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/")
available_checkpoints = find_all_recovery_checkpoints(temp_folder)
if available_checkpoints is not None:
return extract_latest_checkpoint_and_epoch(available_checkpoints)
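One detail worth highlighting in this file is the prefix normalization via folder.strip("/") + "/". A small self-contained sketch of that behavior, assuming callers may pass the folder with or without surrounding slashes:

def clean_prefix(folder: str) -> str:
    # Same normalization as cleaned_prefix above: "foo", "foo/" and "/foo/"
    # all become "foo/".
    return folder.strip("/") + "/"

assert clean_prefix("outputs/checkpoints") == "outputs/checkpoints/"
assert clean_prefix("/outputs/checkpoints/") == "outputs/checkpoints/"
# Because downloads preserve the folder structure, the files land in
# <temp_folder>/outputs/checkpoints/, which is exactly the path returned.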
17 changes: 11 additions & 6 deletions Tests/AfterTraining/test_after_training.py
@@ -29,7 +29,8 @@
from InnerEye.Common import common_util, fixed_paths, fixed_paths_for_tests
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, \
get_best_epoch_results_path
from InnerEye.Common.fixed_paths import (DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME,
from InnerEye.Common.fixed_paths import (DEFAULT_AML_UPLOAD_DIR, DEFAULT_RESULT_IMAGE_NAME,
DEFAULT_RESULT_ZIP_DICOM_NAME,
PYTHON_ENVIRONMENT_NAME, repository_root_directory)
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
@@ -44,7 +45,7 @@
from InnerEye.ML.reports.notebook_report import get_html_report_name
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.runner import main
from InnerEye.ML.utils.checkpoint_handling import download_checkpoints_to_temp_folder, \
from InnerEye.ML.utils.checkpoint_handling import download_folder_from_run_to_temp_folder, \
find_recovery_checkpoint_and_epoch
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.image_util import get_unit_image_header
@@ -225,7 +226,10 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
Check that we can download checkpoint files from an AzureML run, if they are not available on disk.
"""
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
temp_folder = download_checkpoints_to_temp_folder(run, workspace=get_default_workspace())
checkpoint_folder = f"{DEFAULT_AML_UPLOAD_DIR}/{CHECKPOINT_FOLDER}/"
temp_folder = download_folder_from_run_to_temp_folder(folder=checkpoint_folder,
run=run,
workspace=get_default_workspace())
files = list(temp_folder.glob("*"))
assert len(files) == 2
# Check that the contents of the folder are really files, not directories
@@ -234,11 +238,12 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
# Now test if that is correctly integrated into the checkpoint finder. To avoid downloading a second time,
# now mock the call to the actual downloader.
with mock.patch("InnerEye.ML.utils.checkpoint_handling.is_running_in_azure_ml", return_value=True):
with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_checkpoints_to_temp_folder",
with mock.patch("InnerEye.ML.utils.checkpoint_handling.download_folder_from_run_to_temp_folder",
return_value=temp_folder) as download:
# Call the checkpoint finder with a temp folder that does not contain any files, so it should try to download
# Call the checkpoint finder with a temp folder that does not contain any files, so it should try to
# download
result = find_recovery_checkpoint_and_epoch(test_output_dirs.root_dir)
download.assert_called_once_with()
download.assert_called_once_with(folder=checkpoint_folder)
assert result is not None
p, epoch = result
# The basic model only writes one checkpoint at epoch 1
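The tightened assertion download.assert_called_once_with(folder=checkpoint_folder) verifies both the call count and the exact arguments. A self-contained sketch of that behavior, with illustrative names only:

from unittest import mock

download = mock.MagicMock(return_value="/tmp/fake-checkpoints")
download(folder="outputs/checkpoints/")
# Passes: exactly one call, made with exactly these keyword arguments.
download.assert_called_once_with(folder="outputs/checkpoints/")
# A positional call, download("outputs/checkpoints/"), would fail this
# keyword-based assertion even though the value is the same.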