From 8348bbf4bc96e152b3c2fee08f7c33d8acf22a9d Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 10:29:41 +0100 Subject: [PATCH 01/21] Only download checkpoints on node 0 --- InnerEye/ML/run_ml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py index 94c5f7bd8..bb509de4d 100644 --- a/InnerEye/ML/run_ml.py +++ b/InnerEye/ML/run_ml.py @@ -44,7 +44,7 @@ from InnerEye.ML.model_config_base import ModelConfigBase from InnerEye.ML.model_inference_config import ModelInferenceConfig from InnerEye.ML.model_testing import model_test -from InnerEye.ML.model_training import create_lightning_trainer, model_train +from InnerEye.ML.model_training import create_lightning_trainer, is_global_rank_zero, model_train from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \ generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \ get_ipynb_report_name, reports_folder @@ -222,7 +222,8 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None: azure_config=self.azure_config, project_root=self.project_root, run_context=RUN_CONTEXT) - self.checkpoint_handler.download_recovery_checkpoints_or_weights() + if is_global_rank_zero(): + self.checkpoint_handler.download_recovery_checkpoints_or_weights() # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files. if isinstance(self.container, InnerEyeContainer): From 0bdaf68166159d528b7993a6def8b389f8a6bf57 Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 13:21:34 +0100 Subject: [PATCH 02/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 3975647db..1250be9d2 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -15,6 +15,7 @@ import sys from pathlib import Path from typing import List +from unittest import mock import numpy as np import pytest @@ -38,6 +39,7 @@ from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, ModelCategory from InnerEye.ML.model_inference_config import read_model_inference_config from InnerEye.ML.reports.notebook_report import get_html_report_name +from InnerEye.ML.runner import main from InnerEye.ML.utils.config_loader import ModelConfigLoader from InnerEye.ML.utils.image_util import get_unit_image_header from InnerEye.ML.utils.io_util import zip_random_dicom_series @@ -217,6 +219,34 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory()) assert seg_path.exists(), f"Result file {seg_path} was not created" +@pytest.mark.after_training_2node +def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): + args_list = ["--model", "BasicModel2EpochsMoreData", + "--azureml", "True", + "--num_nodes", "2", + "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), + "--num_epochs", "3", + ] + with mock.patch("sys.argv", [""] + args_list): + main() + print(get_most_recent_run_id()) + run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) + assert run.status == RunStatus.COMPLETED + files = run.get_file_names() + # There are two nodes, so there should be one log file per node. + log0_path = "azureml-logs/70_driver_log_0.txt" + log1_path = "azureml-logs/70_driver_log_1.txt" + assert log0_path in files, "Node rank 0 log file is missing" + assert log1_path in files, "Node rank 1 log file is missing" + # Download both log files and check their contents + log0 = test_output_dirs.root_dir / log0_path + log1 = test_output_dirs.root_dir / log1_path + run.download_file(log0_path, output_file_path=str(log0)) + run.download_file(log1_path, output_file_path=str(log1)) + log0_txt = log0.read_text() + log1_txt = log1.read_text() + assert "Downloading multiple files from run" in log0_txt + assert "Downloading multiple files from run" not in log1_txt def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool: return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files From 82631c5e9a9bbb24512c6d0f50bc4b9245361c8e Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 14:04:23 +0100 Subject: [PATCH 03/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 1250be9d2..357f0a0d0 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -30,7 +30,7 @@ from InnerEye.Common.common_util import CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, get_best_epoch_results_path from InnerEye.Common.fixed_paths import DEFAULT_AML_LOGS_DIR, DEFAULT_RESULT_IMAGE_NAME, \ DEFAULT_RESULT_ZIP_DICOM_NAME, \ - PYTHON_ENVIRONMENT_NAME + PYTHON_ENVIRONMENT_NAME, repository_root_directory from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path from InnerEye.Common.output_directories import OutputFolderForTests from InnerEye.Common.spawn_subprocess import spawn_and_monitor_subprocess @@ -227,7 +227,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), "--num_epochs", "3", ] - with mock.patch("sys.argv", [""] + args_list): + script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") + with mock.patch("sys.argv", [script] + args_list): main() print(get_most_recent_run_id()) run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) From acf6268be4e07511e73eee1b01eb3d3970c1e67b Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 16:06:07 +0100 Subject: [PATCH 04/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 357f0a0d0..9bc91fc6f 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -226,6 +226,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), "--num_epochs", "3", + "--wait_for_completion", "True" ] script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") with mock.patch("sys.argv", [script] + args_list): From d2016843eef5a6c872715c9fa993ee503c132883 Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 17:29:13 +0100 Subject: [PATCH 05/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 9bc91fc6f..494a2414a 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -13,6 +13,7 @@ import os import shutil import sys +import time from pathlib import Path from typing import List from unittest import mock @@ -225,14 +226,14 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--azureml", "True", "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), - "--num_epochs", "3", - "--wait_for_completion", "True" + "--num_epochs", "3" ] script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") with mock.patch("sys.argv", [script] + args_list): main() - print(get_most_recent_run_id()) run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) + while run.status not in [RunStatus.COMPLETED, RunStatus.FAILED]: + time.sleep(10) assert run.status == RunStatus.COMPLETED files = run.get_file_names() # There are two nodes, so there should be one log file per node. From 69798e594f5cdc66278c7c8c73ec73793b1b0719 Mon Sep 17 00:00:00 2001 From: melanibe Date: Mon, 7 Jun 2021 18:43:53 +0100 Subject: [PATCH 06/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 494a2414a..151b2ecc0 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -226,7 +226,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--azureml", "True", "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), - "--num_epochs", "3" + "--num_epochs", "3", + "--max_nums_gpus", "1" ] script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") with mock.patch("sys.argv", [script] + args_list): From b3e7c661bc17b35a8fd5f8ad65739279676ef43d Mon Sep 17 00:00:00 2001 From: melanibe Date: Tue, 8 Jun 2021 09:09:20 +0100 Subject: [PATCH 07/21] Try out new test --- Tests/AfterTraining/test_after_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 151b2ecc0..584a0629f 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -227,7 +227,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), "--num_epochs", "3", - "--max_nums_gpus", "1" + "--max_num_gpus", "1" ] script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") with mock.patch("sys.argv", [script] + args_list): From f54fe2852fbf74408a332b50e35bf5c260c1f6d7 Mon Sep 17 00:00:00 2001 From: melanibe Date: Wed, 9 Jun 2021 09:03:40 +0100 Subject: [PATCH 08/21] Try update env --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 834011306..6e9a85e5f 100644 --- a/environment.yml +++ b/environment.yml @@ -47,7 +47,7 @@ dependencies: - pytest-cov==2.10.1 - pytest-forked==1.3.0 - pytest-xdist==1.34.0 - - pytorch-lightning==1.2.8 + - pytorch-lightning==1.3.4 - rich==5.1.1 - rpdb==0.1.6 - runstats==1.8.0 From 327be48604ba90a32ff7ea04a1f8960ee5e5ef0d Mon Sep 17 00:00:00 2001 From: melanibe Date: Wed, 9 Jun 2021 09:58:31 +0100 Subject: [PATCH 09/21] Try update env --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 6e9a85e5f..cc681d783 100644 --- a/environment.yml +++ b/environment.yml @@ -47,7 +47,7 @@ dependencies: - pytest-cov==2.10.1 - pytest-forked==1.3.0 - pytest-xdist==1.34.0 - - pytorch-lightning==1.3.4 + - pytorch-lightning==1.3.1 - rich==5.1.1 - rpdb==0.1.6 - runstats==1.8.0 From 315a40bb75a2115ba5cb9b0ef80b6c3ba3b19e64 Mon Sep 17 00:00:00 2001 From: melanibe Date: Wed, 9 Jun 2021 11:43:23 +0100 Subject: [PATCH 10/21] Try update env --- InnerEye/ML/model_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py index 6c4810094..17a1fddaf 100644 --- a/InnerEye/ML/model_training.py +++ b/InnerEye/ML/model_training.py @@ -194,7 +194,7 @@ def create_lightning_trainer(container: LightningContainer, sync_batchnorm=True, terminate_on_nan=container.detect_anomaly, resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None, - plugins=plugins, + plugins=[], # plugins **kwargs) return trainer, storing_logger From 063ae2a50063f78af4804f60d48f774d950c367e Mon Sep 17 00:00:00 2001 From: melanibe Date: Wed, 9 Jun 2021 17:50:36 +0100 Subject: [PATCH 11/21] Restore --- InnerEye/ML/model_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py index 17a1fddaf..6c4810094 100644 --- a/InnerEye/ML/model_training.py +++ b/InnerEye/ML/model_training.py @@ -194,7 +194,7 @@ def create_lightning_trainer(container: LightningContainer, sync_batchnorm=True, terminate_on_nan=container.detect_anomaly, resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None, - plugins=[], # plugins + plugins=plugins, **kwargs) return trainer, storing_logger From d954aef5c3bea5dfd97945eeadc811a6f4d6a252 Mon Sep 17 00:00:00 2001 From: melanibe Date: Wed, 9 Jun 2021 17:50:46 +0100 Subject: [PATCH 12/21] Restore --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index cc681d783..69ce488f3 100644 --- a/environment.yml +++ b/environment.yml @@ -47,7 +47,7 @@ dependencies: - pytest-cov==2.10.1 - pytest-forked==1.3.0 - pytest-xdist==1.34.0 - - pytorch-lightning==1.3.1 + - pytorch-lightning==1.2.4 - rich==5.1.1 - rpdb==0.1.6 - runstats==1.8.0 From 75b9c3def4c94973cfc3fe26f1480dd2a9dfd504 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 09:19:38 +0100 Subject: [PATCH 13/21] Restore --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 69ce488f3..834011306 100644 --- a/environment.yml +++ b/environment.yml @@ -47,7 +47,7 @@ dependencies: - pytest-cov==2.10.1 - pytest-forked==1.3.0 - pytest-xdist==1.34.0 - - pytorch-lightning==1.2.4 + - pytorch-lightning==1.2.8 - rich==5.1.1 - rpdb==0.1.6 - runstats==1.8.0 From a7480e7fc816b1589c6bdf4e9fc5b223b2f5008a Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 09:20:42 +0100 Subject: [PATCH 14/21] Restore --- Tests/AfterTraining/test_after_training.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 584a0629f..11631d382 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -227,14 +227,13 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), "--num_epochs", "3", - "--max_num_gpus", "1" + "--max_num_gpus", "1", + "--wait_for_completion", "True" ] script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") with mock.patch("sys.argv", [script] + args_list): main() run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) - while run.status not in [RunStatus.COMPLETED, RunStatus.FAILED]: - time.sleep(10) assert run.status == RunStatus.COMPLETED files = run.get_file_names() # There are two nodes, so there should be one log file per node. From b0f70b810fd47be9d4c8f0f9d094d1673ebd58c3 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 10:04:32 +0100 Subject: [PATCH 15/21] Try out other versions --- InnerEye/ML/run_ml.py | 3 +-- InnerEye/ML/utils/checkpoint_handling.py | 8 +++++--- InnerEye/ML/utils/run_recovery.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py index bb509de4d..4043e6a02 100644 --- a/InnerEye/ML/run_ml.py +++ b/InnerEye/ML/run_ml.py @@ -222,8 +222,7 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None: azure_config=self.azure_config, project_root=self.project_root, run_context=RUN_CONTEXT) - if is_global_rank_zero(): - self.checkpoint_handler.download_recovery_checkpoints_or_weights() + self.checkpoint_handler.download_recovery_checkpoints_or_weights(is_global_rank_zero()) # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files. if isinstance(self.container, InnerEyeContainer): diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py index fe6b50686..18f74286a 100644 --- a/InnerEye/ML/utils/checkpoint_handling.py +++ b/InnerEye/ML/utils/checkpoint_handling.py @@ -57,7 +57,7 @@ def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run: if not path.is_dir(): raise NotADirectoryError(f"Does not exist or is not a directory: {path}") - def download_recovery_checkpoints_or_weights(self) -> None: + def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> None: """ Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the run_recovery_object, weights_url or local_weights_path. @@ -65,7 +65,8 @@ def download_recovery_checkpoints_or_weights(self) -> None: """ if self.azure_config.run_recovery_id: run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip()) - self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover) + self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover, + only_return_path=is_global_rank_zero) else: self.run_recovery = None @@ -73,7 +74,8 @@ def download_recovery_checkpoints_or_weights(self) -> None: run_to_recover = self.azure_config.fetch_run(self.azure_config.pretraining_run_recovery_id.strip()) run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover, - EXTRA_RUN_SUBFOLDER) + EXTRA_RUN_SUBFOLDER, + only_return_path=is_global_rank_zero) self.container.extra_downloaded_run_id = run_recovery_object else: self.container.extra_downloaded_run_id = None diff --git a/InnerEye/ML/utils/run_recovery.py b/InnerEye/ML/utils/run_recovery.py index f7acea153..3b8181198 100644 --- a/InnerEye/ML/utils/run_recovery.py +++ b/InnerEye/ML/utils/run_recovery.py @@ -63,7 +63,8 @@ def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) -> @staticmethod def download_all_checkpoints_from_run(config: OutputParams, run: Run, - subfolder: Optional[str] = None) -> RunRecovery: + subfolder: Optional[str] = None, + only_return_path: bool = False) -> RunRecovery: """ Downloads all checkpoints of the provided run inside the checkpoints folder. :param config: Model related configs. @@ -77,11 +78,12 @@ def download_all_checkpoints_from_run(config: OutputParams, run: Run, destination_folder = config.checkpoint_folder / subfolder if subfolder else config.checkpoint_folder - download_outputs_from_run( - blobs_path=Path(CHECKPOINT_FOLDER), - destination=destination_folder, - run=run - ) + if not only_return_path: + download_outputs_from_run( + blobs_path=Path(CHECKPOINT_FOLDER), + destination=destination_folder, + run=run + ) time.sleep(60) # Needed because AML is not fast enough to download return RunRecovery(checkpoints_roots=[destination_folder]) From 4f58faa1de63335acec5e057c15e353cd3c60dea Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 12:05:07 +0100 Subject: [PATCH 16/21] This fixes it --- InnerEye/ML/utils/checkpoint_handling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py index 18f74286a..84d6f63d4 100644 --- a/InnerEye/ML/utils/checkpoint_handling.py +++ b/InnerEye/ML/utils/checkpoint_handling.py @@ -63,10 +63,11 @@ def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> run_recovery_object, weights_url or local_weights_path. This is called at the start of training. """ + only_return_path = not is_global_rank_zero if self.azure_config.run_recovery_id: run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip()) self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover, - only_return_path=is_global_rank_zero) + only_return_path=only_return_path) else: self.run_recovery = None @@ -75,7 +76,7 @@ def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover, EXTRA_RUN_SUBFOLDER, - only_return_path=is_global_rank_zero) + only_return_path=only_return_path) self.container.extra_downloaded_run_id = run_recovery_object else: self.container.extra_downloaded_run_id = None From c675bbe9fcd3bbdd7de63b0ed5fd5ad99e4d2385 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 12:07:16 +0100 Subject: [PATCH 17/21] Fix it --- InnerEye/ML/run_ml.py | 2 +- InnerEye/ML/utils/checkpoint_handling.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py index 4043e6a02..743c9256b 100644 --- a/InnerEye/ML/run_ml.py +++ b/InnerEye/ML/run_ml.py @@ -222,7 +222,7 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None: azure_config=self.azure_config, project_root=self.project_root, run_context=RUN_CONTEXT) - self.checkpoint_handler.download_recovery_checkpoints_or_weights(is_global_rank_zero()) + self.checkpoint_handler.download_recovery_checkpoints_or_weights(only_return_path=not is_global_rank_zero()) # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files. if isinstance(self.container, InnerEyeContainer): diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py index 84d6f63d4..fdd3e3240 100644 --- a/InnerEye/ML/utils/checkpoint_handling.py +++ b/InnerEye/ML/utils/checkpoint_handling.py @@ -57,13 +57,12 @@ def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run: if not path.is_dir(): raise NotADirectoryError(f"Does not exist or is not a directory: {path}") - def download_recovery_checkpoints_or_weights(self, is_global_rank_zero: bool) -> None: + def download_recovery_checkpoints_or_weights(self, only_return_path: bool = False) -> None: """ Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the run_recovery_object, weights_url or local_weights_path. This is called at the start of training. """ - only_return_path = not is_global_rank_zero if self.azure_config.run_recovery_id: run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip()) self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover, From d8955b8fd0773ecaa98d0e2fded590a8ca986807 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 12:11:21 +0100 Subject: [PATCH 18/21] Clean test up --- Tests/AfterTraining/test_after_training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 11631d382..437315e8a 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -226,7 +226,7 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): "--azureml", "True", "--num_nodes", "2", "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), - "--num_epochs", "3", + "--num_epochs", "4", "--max_num_gpus", "1", "--wait_for_completion", "True" ] @@ -250,6 +250,8 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): log1_txt = log1.read_text() assert "Downloading multiple files from run" in log0_txt assert "Downloading multiple files from run" not in log1_txt + assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt + assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool: return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files From cb2da196dd2a86babae904759eb8c99f139567cf Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 12:14:32 +0100 Subject: [PATCH 19/21] Clean test up --- Tests/AfterTraining/test_after_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 437315e8a..b139f14fe 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -13,7 +13,6 @@ import os import shutil import sys -import time from pathlib import Path from typing import List from unittest import mock From e5bb2d30e1c9ca25072aa5241bcaeea17c1cbab5 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 12:20:45 +0100 Subject: [PATCH 20/21] Docs and mypy --- InnerEye/ML/utils/checkpoint_handling.py | 3 +++ InnerEye/ML/utils/run_recovery.py | 3 +++ Tests/AfterTraining/test_after_training.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py index fdd3e3240..7ddda99c9 100644 --- a/InnerEye/ML/utils/checkpoint_handling.py +++ b/InnerEye/ML/utils/checkpoint_handling.py @@ -62,6 +62,9 @@ def download_recovery_checkpoints_or_weights(self, only_return_path: bool = Fals Download checkpoints from a run recovery object or from a weights url. Set the checkpoints path based on the run_recovery_object, weights_url or local_weights_path. This is called at the start of training. + :param: only_return_path: if True, return a RunRecovery object with the path to the checkpoint without actually + downloading the checkpoints. This is useful to avoid duplicating checkpoint download when running on multiple + nodes. If False, return the RunRecovery object and download the checkpoint to disk. """ if self.azure_config.run_recovery_id: run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip()) diff --git a/InnerEye/ML/utils/run_recovery.py b/InnerEye/ML/utils/run_recovery.py index 3b8181198..bb92cdabe 100644 --- a/InnerEye/ML/utils/run_recovery.py +++ b/InnerEye/ML/utils/run_recovery.py @@ -71,6 +71,9 @@ def download_all_checkpoints_from_run(config: OutputParams, run: Run, :param run: Run whose checkpoints should be recovered :param subfolder: optional subfolder name, if provided the checkpoints will be downloaded to CHECKPOINT_FOLDER / subfolder. If None, the checkpoint are downloaded to CHECKPOINT_FOLDER of the current run. + :param: only_return_path: if True, return a RunRecovery object with the path to the checkpoint without actually + downloading the checkpoints. This is useful to avoid duplicating checkpoint download when running on multiple + nodes. If False, return the RunRecovery object and download the checkpoint to disk. :return: run recovery information """ if fetch_child_runs(run): diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index b139f14fe..2e51c72dd 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -220,7 +220,7 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor assert seg_path.exists(), f"Result file {seg_path} was not created" @pytest.mark.after_training_2node -def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests): +def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None: args_list = ["--model", "BasicModel2EpochsMoreData", "--azureml", "True", "--num_nodes", "2", From fdc573732309965373ac6f952add50c91465f790 Mon Sep 17 00:00:00 2001 From: melanibe Date: Thu, 10 Jun 2021 13:14:36 +0100 Subject: [PATCH 21/21] Docs and mypy --- Tests/AfterTraining/test_after_training.py | 66 +++++++++++----------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 2e51c72dd..98271b5d0 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -219,38 +219,6 @@ def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderFor submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory()) assert seg_path.exists(), f"Result file {seg_path} was not created" -@pytest.mark.after_training_2node -def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None: - args_list = ["--model", "BasicModel2EpochsMoreData", - "--azureml", "True", - "--num_nodes", "2", - "--run_recovery_id", str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), - "--num_epochs", "4", - "--max_num_gpus", "1", - "--wait_for_completion", "True" - ] - script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") - with mock.patch("sys.argv", [script] + args_list): - main() - run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) - assert run.status == RunStatus.COMPLETED - files = run.get_file_names() - # There are two nodes, so there should be one log file per node. - log0_path = "azureml-logs/70_driver_log_0.txt" - log1_path = "azureml-logs/70_driver_log_1.txt" - assert log0_path in files, "Node rank 0 log file is missing" - assert log1_path in files, "Node rank 1 log file is missing" - # Download both log files and check their contents - log0 = test_output_dirs.root_dir / log0_path - log1 = test_output_dirs.root_dir / log1_path - run.download_file(log0_path, output_file_path=str(log0)) - run.download_file(log1_path, output_file_path=str(log1)) - log0_txt = log0.read_text() - log1_txt = log1.read_text() - assert "Downloading multiple files from run" in log0_txt - assert "Downloading multiple files from run" not in log1_txt - assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt - assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool: return f"{CROSSVAL_RESULTS_FOLDER}/{split}/{mode.value}/metrics.csv" in available_files @@ -385,3 +353,37 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None: assert "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4" in log0_txt assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt assert "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4" in log1_txt + + +@pytest.mark.after_training_2node +def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None: + args_list = ["--model", "BasicModel2EpochsMoreData", + "--azureml", "True", + "--num_nodes", "2", + "--run_recovery_id", + str(get_most_recent_run_id(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)), + "--num_epochs", "4", + "--wait_for_completion", "True" + ] + script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py") + with mock.patch("sys.argv", [script] + args_list): + main() + run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN) + assert run.status == RunStatus.COMPLETED + files = run.get_file_names() + # There are two nodes, so there should be one log file per node. + log0_path = "azureml-logs/70_driver_log_0.txt" + log1_path = "azureml-logs/70_driver_log_1.txt" + assert log0_path in files, "Node rank 0 log file is missing" + assert log1_path in files, "Node rank 1 log file is missing" + # Download both log files and check their contents + log0 = test_output_dirs.root_dir / log0_path + log1 = test_output_dirs.root_dir / log1_path + run.download_file(log0_path, output_file_path=str(log0)) + run.download_file(log1_path, output_file_path=str(log1)) + log0_txt = log0.read_text() + log1_txt = log1.read_text() + assert "Downloading multiple files from run" in log0_txt + assert "Downloading multiple files from run" not in log1_txt + assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt + assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log1_txt