From 8348bbf4bc96e152b3c2fee08f7c33d8acf22a9d Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 10:29:41 +0100
Subject: [PATCH 01/21] Only download checkpoints on node 0

---
 InnerEye/ML/run_ml.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index 94c5f7bd8..bb509de4d 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -44,7 +44,7 @@
 from InnerEye.ML.model_config_base import ModelConfigBase
 from InnerEye.ML.model_inference_config import ModelInferenceConfig
 from InnerEye.ML.model_testing import model_test
-from InnerEye.ML.model_training import create_lightning_trainer, model_train
+from InnerEye.ML.model_training import create_lightning_trainer, is_global_rank_zero, model_train
 from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
     generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \
     get_ipynb_report_name, reports_folder
@@ -222,7 +222,8 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None:
                                                     azure_config=self.azure_config,
                                                     project_root=self.project_root,
                                                     run_context=RUN_CONTEXT)
-        self.checkpoint_handler.download_recovery_checkpoints_or_weights()
+        if is_global_rank_zero():
+            self.checkpoint_handler.download_recovery_checkpoints_or_weights()
 
         # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
         if isinstance(self.container, InnerEyeContainer):

From 0bdaf68166159d528b7993a6def8b389f8a6bf57 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 13:21:34 +0100
Subject: [PATCH 02/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 3975647db..1250be9d2 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -15,6 +15,7 @@
 import sys
 from pathlib import Path
 from typing import List
+from unittest import mock
 
 import numpy as np
 import pytest
@@ -38,6 +39,7 @@
 from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, ModelCategory
 from InnerEye.ML.model_inference_config import read_model_inference_config
 from InnerEye.ML.reports.notebook_report import get_html_report_name
+from InnerEye.ML.runner import main
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
 from InnerEye.ML.utils.image_util import get_unit_image_header
 from InnerEye.ML.utils.io_util import zip_random_dicom_series
@@ -217,6 +219,34 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor
     submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
     assert seg_path.exists(), f"Result file {seg_path} was not created"
 
+@pytest.mark.after_training_2node
+def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
+    args_list = ["--model", "BasicModel2EpochsMoreData",
+                 "--azureml", "True",
+                 "--num_nodes", "2",
+                 "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
+                 "--num_epochs", "3",
+                 ]
+    with mock.patch("sys.argv", [""] + args_list):
+        main()
+    print(get_most_recent_run_id())
+    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
+    assert run.status == RunStatus.COMPLETED
+    files = run.get_file_names()
+    # There are two nodes, so there should be one log file per node.
+    log0_path = "azureml-logs/70_driver_log_0.txt"
+    log1_path = "azureml-logs/70_driver_log_1.txt"
+    assert log0_path in files, "Node rank 0 log file is missing"
+    assert log1_path in files, "Node rank 1 log file is missing"
+    # Download both log files and check their contents
+    log0 = test_output_dirs.root_dir / log0_path
+    log1 = test_output_dirs.root_dir / log1_path
+    run.download_file(log0_path, output_file_path=str(log0))
+    run.download_file(log1_path, output_file_path=str(log1))
+    log0_txt = log0.read_text()
+    log1_txt = log1.read_text()
+    assert "Downloading multiple files from run" in log0_txt
+    assert "Downloading multiple files from run" not in log1_txt
 
 def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
     return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files

From 82631c5e9a9bbb24512c6d0f50bc4b9245361c8e Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 14:04:23 +0100
Subject: [PATCH 03/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 1250be9d2..357f0a0d0 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -30,7 +30,7 @@
 from InnerEye.Common.common_util import CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, get_best_epoch_results_path
 from InnerEye.Common.fixed_paths import DEFAULT_AML_LOGS_DIR, DEFAULT_RESULT_IMAGE_NAME, \
     DEFAULT_RESULT_ZIP_DICOM_NAME, \
-    PYTHON_ENVIRONMENT_NAME
+    PYTHON_ENVIRONMENT_NAME, repository_root_directory
 from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
 from InnerEye.Common.output_directories import OutputFolderForTests
 from InnerEye.Common.spawn_subprocess import spawn_and_monitor_subprocess
@@ -227,7 +227,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
                  "--num_epochs", "3",
                  ]
-    with mock.patch("sys.argv", [""] + args_list):
+    script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
+    with mock.patch("sys.argv", [script] + args_list):
         main()
     print(get_most_recent_run_id())
     run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)

From acf6268be4e07511e73eee1b01eb3d3970c1e67b Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 16:06:07 +0100
Subject: [PATCH 04/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 357f0a0d0..9bc91fc6f 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -226,6 +226,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
                  "--num_epochs", "3",
+                 "--wait_for_completion", "True"
                  ]
     script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
     with mock.patch("sys.argv", [script] + args_list):

From d2016843eef5a6c872715c9fa993ee503c132883 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 17:29:13 +0100
Subject: [PATCH 05/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 9bc91fc6f..494a2414a 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -13,6 +13,7 @@
 import os
 import shutil
 import sys
+import time
 from pathlib import Path
 from typing import List
 from unittest import mock
@@ -225,14 +226,14 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--azureml", "True",
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
-                 "--num_epochs", "3",
-                 "--wait_for_completion", "True"
+                 "--num_epochs", "3"
                  ]
     script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
     with mock.patch("sys.argv", [script] + args_list):
         main()
-    print(get_most_recent_run_id())
     run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
+    while run.status not in [RunStatus.COMPLETED, RunStatus.FAILED]:
+        time.sleep(10)
     assert run.status == RunStatus.COMPLETED
     files = run.get_file_names()
     # There are two nodes, so there should be one log file per node.

From 69798e594f5cdc66278c7c8c73ec73793b1b0719 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Mon, 7 Jun 2021 18:43:53 +0100
Subject: [PATCH 06/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 494a2414a..151b2ecc0 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -226,7 +226,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--azureml", "True",
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
-                 "--num_epochs", "3"
+                 "--num_epochs", "3",
+                 "--max_nums_gpus", "1"
                  ]
     script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
     with mock.patch("sys.argv", [script] + args_list):

From b3e7c661bc17b35a8fd5f8ad65739279676ef43d Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Tue, 8 Jun 2021 09:09:20 +0100
Subject: [PATCH 07/21] Try out new test

---
 Tests/AfterTraining/test_after_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 151b2ecc0..584a0629f 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -227,7 +227,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
                  "--num_epochs", "3",
-                 "--max_nums_gpus", "1"
+                 "--max_num_gpus", "1"
                  ]
     script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
     with mock.patch("sys.argv", [script] + args_list):

From f54fe2852fbf74408a332b50e35bf5c260c1f6d7 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Wed, 9 Jun 2021 09:03:40 +0100
Subject: [PATCH 08/21] Try update env

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 834011306..6e9a85e5f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -47,7 +47,7 @@ dependencies:
       - pytest-cov==2.10.1
       - pytest-forked==1.3.0
       - pytest-xdist==1.34.0
-      - pytorch-lightning==1.2.8
+      - pytorch-lightning==1.3.4
       - rich==5.1.1
       - rpdb==0.1.6
       - runstats==1.8.0

From 327be48604ba90a32ff7ea04a1f8960ee5e5ef0d Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Wed, 9 Jun 2021 09:58:31 +0100
Subject: [PATCH 09/21] Try update env

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 6e9a85e5f..cc681d783 100644
--- a/environment.yml
+++ b/environment.yml
@@ -47,7 +47,7 @@ dependencies:
       - pytest-cov==2.10.1
       - pytest-forked==1.3.0
       - pytest-xdist==1.34.0
-      - pytorch-lightning==1.3.4
+      - pytorch-lightning==1.3.1
       - rich==5.1.1
       - rpdb==0.1.6
       - runstats==1.8.0

From 315a40bb75a2115ba5cb9b0ef80b6c3ba3b19e64 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Wed, 9 Jun 2021 11:43:23 +0100
Subject: [PATCH 10/21] Try update env

---
 InnerEye/ML/model_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py
index 6c4810094..17a1fddaf 100644
--- a/InnerEye/ML/model_training.py
+++ b/InnerEye/ML/model_training.py
@@ -194,7 +194,7 @@ def create_lightning_trainer(container: LightningContainer,
                       sync_batchnorm=True,
                       terminate_on_nan=container.detect_anomaly,
                       resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
-                      plugins=plugins,
+                      plugins=[], # plugins
                       **kwargs)
     return trainer, storing_logger
 

From 063ae2a50063f78af4804f60d48f774d950c367e Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Wed, 9 Jun 2021 17:50:36 +0100
Subject: [PATCH 11/21] Restore

---
 InnerEye/ML/model_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py
index 17a1fddaf..6c4810094 100644
--- a/InnerEye/ML/model_training.py
+++ b/InnerEye/ML/model_training.py
@@ -194,7 +194,7 @@ def create_lightning_trainer(container: LightningContainer,
                       sync_batchnorm=True,
                       terminate_on_nan=container.detect_anomaly,
                       resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
-                      plugins=[], # plugins
+                      plugins=plugins,
                       **kwargs)
     return trainer, storing_logger
 

From d954aef5c3bea5dfd97945eeadc811a6f4d6a252 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Wed, 9 Jun 2021 17:50:46 +0100
Subject: [PATCH 12/21] Restore

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index cc681d783..69ce488f3 100644
--- a/environment.yml
+++ b/environment.yml
@@ -47,7 +47,7 @@ dependencies:
       - pytest-cov==2.10.1
       - pytest-forked==1.3.0
       - pytest-xdist==1.34.0
-      - pytorch-lightning==1.3.1
+      - pytorch-lightning==1.2.4
       - rich==5.1.1
       - rpdb==0.1.6
       - runstats==1.8.0

From 75b9c3def4c94973cfc3fe26f1480dd2a9dfd504 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 09:19:38 +0100
Subject: [PATCH 13/21] Restore

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 69ce488f3..834011306 100644
--- a/environment.yml
+++ b/environment.yml
@@ -47,7 +47,7 @@ dependencies:
       - pytest-cov==2.10.1
       - pytest-forked==1.3.0
       - pytest-xdist==1.34.0
-      - pytorch-lightning==1.2.4
+      - pytorch-lightning==1.2.8
       - rich==5.1.1
       - rpdb==0.1.6
       - runstats==1.8.0

From a7480e7fc816b1589c6bdf4e9fc5b223b2f5008a Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 09:20:42 +0100
Subject: [PATCH 14/21] Restore

---
 Tests/AfterTraining/test_after_training.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 584a0629f..11631d382 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -227,14 +227,13 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
                  "--num_epochs", "3",
-                 "--max_num_gpus", "1"
+                 "--max_num_gpus", "1",
+                 "--wait_for_completion", "True"
                  ]
     script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
     with mock.patch("sys.argv", [script] + args_list):
         main()
     run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
-    while run.status not in [RunStatus.COMPLETED, RunStatus.FAILED]:
-        time.sleep(10)
     assert run.status == RunStatus.COMPLETED
     files = run.get_file_names()
     # There are two nodes, so there should be one log file per node.

From b0f70b810fd47be9d4c8f0f9d094d1673ebd58c3 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 10:04:32 +0100
Subject: [PATCH 15/21] Try out other versions

---
 InnerEye/ML/run_ml.py                    |  3 +--
 InnerEye/ML/utils/checkpoint_handling.py |  8 +++++---
 InnerEye/ML/utils/run_recovery.py        | 14 ++++++++------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index bb509de4d..4043e6a02 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -222,8 +222,7 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None:
                                                     azure_config=self.azure_config,
                                                     project_root=self.project_root,
                                                     run_context=RUN_CONTEXT)
-        if is_global_rank_zero():
-            self.checkpoint_handler.download_recovery_checkpoints_or_weights()
+        self.checkpoint_handler.download_recovery_checkpoints_or_weights(is_global_rank_zero())
 
         # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
         if isinstance(self.container, InnerEyeContainer):
diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py
index fe6b50686..18f74286a 100644
--- a/InnerEye/ML/utils/checkpoint_handling.py
+++ b/InnerEye/ML/utils/checkpoint_handling.py
@@ -57,7 +57,7 @@ def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run:
             if not path.is_dir():
                 raise NotADirectoryError(f"Does not exist or is not a directory: {path}")
 
-    def download_recovery_checkpoints_or_weights(self) -> None:
+    def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> None:
         """
         Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the
         run_recovery_object, weights_url or local_weights_path.
@@ -65,7 +65,8 @@ def download_recovery_checkpoints_or_weights(self) -> None:
         """
         if self.azure_config.run_recovery_id:
             run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
-            self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover)
+            self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover,
+                                                                              only_return_path=is_global_rank_zero)
         else:
             self.run_recovery = None
 
@@ -73,7 +74,8 @@ def download_recovery_checkpoints_or_weights(self) -> None:
             run_to_recover = self.azure_config.fetch_run(self.azure_config.pretraining_run_recovery_id.strip())
             run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.output_params,
                                                                                 run_to_recover,
-                                                                                EXTRA_RUN_SUBFOLDER)
+                                                                                EXTRA_RUN_SUBFOLDER,
+                                                                                only_return_path=is_global_rank_zero)
             self.container.extra_downloaded_run_id = run_recovery_object
         else:
             self.container.extra_downloaded_run_id = None
diff --git a/InnerEye/ML/utils/run_recovery.py b/InnerEye/ML/utils/run_recovery.py
index f7acea153..3b8181198 100644
--- a/InnerEye/ML/utils/run_recovery.py
+++ b/InnerEye/ML/utils/run_recovery.py
@@ -63,7 +63,8 @@ def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) ->
 
     @staticmethod
     def download_all_checkpoints_from_run(config: OutputParams, run: Run,
-                                          subfolder: Optional[str] = None) -> RunRecovery:
+                                          subfolder: Optional[str] = None,
+                                          only_return_path: bool = False) -> RunRecovery:
         """
         Downloads all checkpoints of the provided run inside the checkpoints folder.
         :param config: Model related configs.
@@ -77,11 +78,12 @@ def download_all_checkpoints_from_run(config: OutputParams, run: Run,
 
         destination_folder = config.checkpoint_folder / subfolder if subfolder else config.checkpoint_folder
 
-        download_outputs_from_run(
-            blobs_path=Path(CHECKPOINT_FOLDER),
-            destination=destination_folder,
-            run=run
-        )
+        if not only_return_path:
+            download_outputs_from_run(
+                blobs_path=Path(CHECKPOINT_FOLDER),
+                destination=destination_folder,
+                run=run
+            )
         time.sleep(60)  # Needed because AML is not fast enough to download
         return RunRecovery(checkpoints_roots=[destination_folder])
 

From 4f58faa1de63335acec5e057c15e353cd3c60dea Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 12:05:07 +0100
Subject: [PATCH 16/21] This fixes it

---
 InnerEye/ML/utils/checkpoint_handling.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py
index 18f74286a..84d6f63d4 100644
--- a/InnerEye/ML/utils/checkpoint_handling.py
+++ b/InnerEye/ML/utils/checkpoint_handling.py
@@ -63,10 +63,11 @@ def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) ->
         run_recovery_object, weights_url or local_weights_path.
         This is called at the start of training.
         """
+        only_return_path = not is_global_rank_zero
         if self.azure_config.run_recovery_id:
             run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
             self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover,
-                                                                              only_return_path=is_global_rank_zero)
+                                                                              only_return_path=only_return_path)
         else:
             self.run_recovery = None
 
@@ -75,7 +76,7 @@ def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) ->
             run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.output_params,
                                                                                 run_to_recover,
                                                                                 EXTRA_RUN_SUBFOLDER,
-                                                                                only_return_path=is_global_rank_zero)
+                                                                                only_return_path=only_return_path)
             self.container.extra_downloaded_run_id = run_recovery_object
         else:
             self.container.extra_downloaded_run_id = None

From c675bbe9fcd3bbdd7de63b0ed5fd5ad99e4d2385 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 12:07:16 +0100
Subject: [PATCH 17/21] Fix it

---
 InnerEye/ML/run_ml.py                    | 2 +-
 InnerEye/ML/utils/checkpoint_handling.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index 4043e6a02..743c9256b 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -222,7 +222,7 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None:
                                                     azure_config=self.azure_config,
                                                     project_root=self.project_root,
                                                     run_context=RUN_CONTEXT)
-        self.checkpoint_handler.download_recovery_checkpoints_or_weights(is_global_rank_zero())
+        self.checkpoint_handler.download_recovery_checkpoints_or_weights(only_return_path=not is_global_rank_zero())
 
         # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
         if isinstance(self.container, InnerEyeContainer):
diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py
index 84d6f63d4..fdd3e3240 100644
--- a/InnerEye/ML/utils/checkpoint_handling.py
+++ b/InnerEye/ML/utils/checkpoint_handling.py
@@ -57,13 +57,12 @@ def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run:
             if not path.is_dir():
                 raise NotADirectoryError(f"Does not exist or is not a directory: {path}")
 
-    def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> None:
+    def download_recovery_checkpoints_or_weights(self, only_return_path: bool = False) -> None:
         """
         Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the
         run_recovery_object, weights_url or local_weights_path.
         This is called at the start of training.
         """
-        only_return_path = not is_global_rank_zero
         if self.azure_config.run_recovery_id:
             run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
             self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover,

From d8955b8fd0773ecaa98d0e2fded590a8ca986807 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 12:11:21 +0100
Subject: [PATCH 18/21] Clean test up

---
 Tests/AfterTraining/test_after_training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 11631d382..437315e8a 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -226,7 +226,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
                  "--azureml", "True",
                  "--num_nodes", "2",
                  "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
-                 "--num_epochs", "3",
+                 "--num_epochs", "4",
                  "--max_num_gpus", "1",
                  "--wait_for_completion", "True"
                  ]
@@ -250,6 +250,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
     log1_txt = log1.read_text()
     assert "Downloading multiple files from run" in log0_txt
     assert "Downloading multiple files from run" not in log1_txt
+    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt
+    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt
 
 def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
     return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files

From cb2da196dd2a86babae904759eb8c99f139567cf Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 12:14:32 +0100
Subject: [PATCH 19/21] Clean test up

---
 Tests/AfterTraining/test_after_training.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 437315e8a..b139f14fe 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -13,7 +13,6 @@
 import os
 import shutil
 import sys
-import time
 from pathlib import Path
 from typing import List
 from unittest import mock

From e5bb2d30e1c9ca25072aa5241bcaeea17c1cbab5 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 12:20:45 +0100
Subject: [PATCH 20/21] Docs and mypy

---
 InnerEye/ML/utils/checkpoint_handling.py   | 3 +++
 InnerEye/ML/utils/run_recovery.py          | 3 +++
 Tests/AfterTraining/test_after_training.py | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py
index fdd3e3240..7ddda99c9 100644
--- a/InnerEye/ML/utils/checkpoint_handling.py
+++ b/InnerEye/ML/utils/checkpoint_handling.py
@@ -62,6 +62,9 @@ def download_recovery_checkpoints_or_weights(self, only_return_path: bool = Fals
         Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the
         run_recovery_object, weights_url or local_weights_path.
         This is called at the start of training.
+        :param: only_return_path: if True, return a RunRecovery object with the path to the checkpoint without actually
+        downloading the checkpoints. This is useful to avoid duplicating checkpoint download when running on multiple
+        nodes. If False, return the RunRecovery object and download the checkpoint to disk.
         """
         if self.azure_config.run_recovery_id:
             run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
diff --git a/InnerEye/ML/utils/run_recovery.py b/InnerEye/ML/utils/run_recovery.py
index 3b8181198..bb92cdabe 100644
--- a/InnerEye/ML/utils/run_recovery.py
+++ b/InnerEye/ML/utils/run_recovery.py
@@ -71,6 +71,9 @@ def download_all_checkpoints_from_run(config: OutputParams, run: Run,
         :param run: Run whose checkpoints should be recovered
         :param subfolder: optional subfolder name, if provided the checkpoints will be downloaded to
         CHECKPOINT_FOLDER / subfolder. If None, the checkpoint are downloaded to CHECKPOINT_FOLDER of the current run.
+        :param: only_return_path: if True, return a RunRecovery object with the path to the checkpoint without actually
+        downloading the checkpoints. This is useful to avoid duplicating checkpoint download when running on multiple
+        nodes. If False, return the RunRecovery object and download the checkpoint to disk.
         :return: run recovery information
         """
         if fetch_child_runs(run):
diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index b139f14fe..2e51c72dd 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -220,7 +220,7 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor
     assert seg_path.exists(), f"Result file {seg_path} was not created"
 
 @pytest.mark.after_training_2node
-def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests):
+def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
     args_list = ["--model", "BasicModel2EpochsMoreData",
                  "--azureml", "True",
                  "--num_nodes", "2",

From fdc573732309965373ac6f952add50c91465f790 Mon Sep 17 00:00:00 2001
From: melanibe <mebernha@microsoft.com>
Date: Thu, 10 Jun 2021 13:14:36 +0100
Subject: [PATCH 21/21] Docs and mypy

---
 Tests/AfterTraining/test_after_training.py | 66 +++++++++++-----------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 2e51c72dd..98271b5d0 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -219,38 +219,6 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor
     submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
     assert seg_path.exists(), f"Result file {seg_path} was not created"
 
-@pytest.mark.after_training_2node
-def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
-    args_list = ["--model", "BasicModel2EpochsMoreData",
-                 "--azureml", "True",
-                 "--num_nodes", "2",
-                 "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
-                 "--num_epochs", "4",
-                 "--max_num_gpus", "1",
-                 "--wait_for_completion", "True"
-                 ]
-    script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
-    with mock.patch("sys.argv", [script] + args_list):
-        main()
-    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
-    assert run.status == RunStatus.COMPLETED
-    files = run.get_file_names()
-    # There are two nodes, so there should be one log file per node.
-    log0_path = "azureml-logs/70_driver_log_0.txt"
-    log1_path = "azureml-logs/70_driver_log_1.txt"
-    assert log0_path in files, "Node rank 0 log file is missing"
-    assert log1_path in files, "Node rank 1 log file is missing"
-    # Download both log files and check their contents
-    log0 = test_output_dirs.root_dir / log0_path
-    log1 = test_output_dirs.root_dir / log1_path
-    run.download_file(log0_path, output_file_path=str(log0))
-    run.download_file(log1_path, output_file_path=str(log1))
-    log0_txt = log0.read_text()
-    log1_txt = log1.read_text()
-    assert "Downloading multiple files from run" in log0_txt
-    assert "Downloading multiple files from run" not in log1_txt
-    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt
-    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt
 
 def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
     return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files
@@ -385,3 +353,37 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
     assert "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4" in log0_txt
     assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
     assert "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4" in log1_txt
+
+
+@pytest.mark.after_training_2node
+def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
+    args_list = ["--model", "BasicModel2EpochsMoreData",
+                 "--azureml", "True",
+                 "--num_nodes", "2",
+                 "--run_recovery_id",
+                 str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)),
+                 "--num_epochs", "4",
+                 "--wait_for_completion", "True"
+                 ]
+    script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
+    with mock.patch("sys.argv", [script] + args_list):
+        main()
+    run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
+    assert run.status == RunStatus.COMPLETED
+    files = run.get_file_names()
+    # There are two nodes, so there should be one log file per node.
+    log0_path = "azureml-logs/70_driver_log_0.txt"
+    log1_path = "azureml-logs/70_driver_log_1.txt"
+    assert log0_path in files, "Node rank 0 log file is missing"
+    assert log1_path in files, "Node rank 1 log file is missing"
+    # Download both log files and check their contents
+    log0 = test_output_dirs.root_dir / log0_path
+    log1 = test_output_dirs.root_dir / log1_path
+    run.download_file(log0_path, output_file_path=str(log0))
+    run.download_file(log1_path, output_file_path=str(log1))
+    log0_txt = log0.read_text()
+    log1_txt = log1.read_text()
+    assert "Downloading multiple files from run" in log0_txt
+    assert "Downloading multiple files from run" not in log1_txt
+    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt
+    assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt