diff --git a/.github/workflows/linting_and_hello_world.yml b/.github/workflows/linting_and_hello_world.yml
index de98bbd5a..9ed6405bd 100644
--- a/.github/workflows/linting_and_hello_world.yml
+++ b/.github/workflows/linting_and_hello_world.yml
@@ -13,6 +13,7 @@ jobs:
- uses: actions/checkout@v2
with:
lfs: true
+ submodules: true
- name: flake8
run: |
@@ -58,7 +59,8 @@ jobs:
- uses: actions/checkout@v2
with:
lfs: true
-
+ submodules: true
+
- uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: InnerEye
diff --git a/.gitignore b/.gitignore
index eeed8fae8..413cbe3fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,6 +131,7 @@ dmypy.json
# other
.vscode/
+.devcontainer/
/InnerEye/ML/src/aml_config
*.exe
diff --git a/.idea/runConfigurations/Template__Run_ML_on_AzureML.xml b/.idea/runConfigurations/Template__Run_ML_on_AzureML.xml
index 8e58a0a7c..28140e00e 100644
--- a/.idea/runConfigurations/Template__Run_ML_on_AzureML.xml
+++ b/.idea/runConfigurations/Template__Run_ML_on_AzureML.xml
@@ -11,8 +11,9 @@
+
-
+
diff --git a/.idea/runConfigurations/pytest_all_simple_tests.xml b/.idea/runConfigurations/pytest_all_simple_tests.xml
new file mode 100644
index 000000000..e5e3a6a01
--- /dev/null
+++ b/.idea/runConfigurations/pytest_all_simple_tests.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dccd68e2b..141d7f0e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,8 @@ jobs that run in AzureML.
- ([#533](https://github.com/microsoft/InnerEye-DeepLearning/pull/533)) Better defaults for inference on ensemble children.
- ([#536](https://github.com/microsoft/InnerEye-DeepLearning/pull/536)) Inference will not run on the validation set by default, this can be turned on
via the `--inference_on_val_set` flag.
+- ([#548](https://github.com/microsoft/InnerEye-DeepLearning/pull/548)) Many Azure-related functions have been moved
+out of the toolbox into the separate hi-ml Python package.
- ([#502](https://github.com/microsoft/InnerEye-DeepLearning/pull/502)) Renamed command line option 'perform_training_set_inference' to 'inference_on_train_set'. Replaced command line option 'perform_validation_and_test_set_inference' with the pair of options 'inference_on_val_set' and 'inference_on_test_set'.
- ([#496](https://github.com/microsoft/InnerEye-DeepLearning/pull/496)) All plots are now saved as PNG, rather than JPG.
- ([#497](https://github.com/microsoft/InnerEye-DeepLearning/pull/497)) Reducing the size of the code snapshot that
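The CHANGELOG entry above is the user-facing summary of this PR: helpers that used to live in InnerEye.Azure are now consumed from the hi-ml package. A minimal sketch of the new import locations, taken from the import statements added further down in this diff (hi-ml is expected either as an installed package or as the hi-ml/src submodule on sys.path):

# Helpers that previously lived in InnerEye.Azure now come from the hi-ml "health" namespace.
# These are the import locations used by azure_runner.py, azure_util.py, run_ml.py and runner.py below.
from health.azure.azure_util import create_run_recovery_id, merge_conda_files, to_azure_friendly_string
from health.azure.datasets import DatasetConfig, get_or_create_dataset
from health.azure.himl import AzureRunInfo, submit_to_azure_if_needed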
diff --git a/InnerEye/Azure/azure_config.py b/InnerEye/Azure/azure_config.py
index 3c32e2a04..eae73d6d2 100755
--- a/InnerEye/Azure/azure_config.py
+++ b/InnerEye/Azure/azure_config.py
@@ -6,20 +6,17 @@
import getpass
import logging
-import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
import param
-from azureml.core import Dataset, Datastore, Run, ScriptRunConfig, Workspace
+from azureml.core import Run, ScriptRunConfig, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication, ServicePrincipalAuthentication
-from azureml.data import FileDataset
-from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.train.hyperdrive import HyperDriveConfig
from git import Repo
-from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context, remove_arg
+from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context
from InnerEye.Azure.secrets_handling import SecretsHandling, read_all_settings
from InnerEye.Common import fixed_paths
from InnerEye.Common.generic_parsing import GenericConfig
@@ -240,64 +237,6 @@ def fetch_run(self, run_recovery_id: str) -> Run:
"""
return fetch_run(workspace=self.get_workspace(), run_recovery_id=run_recovery_id)
- def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
- """
- Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
- created and registered, assuming that the files are in a folder that has the same name as the dataset.
- For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
- /datasets/foo/
- """
- if not self.azureml_datastore:
- raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
- if not azure_dataset_id:
- raise ValueError("No dataset ID provided.")
- workspace = self.get_workspace()
- logging.info(f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace {workspace.name}")
- datastore = Datastore.get(workspace, self.azureml_datastore)
- try:
- logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
- azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
- logging.info("Dataset found.")
- except:
- logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
- # Ensure that there is a / at the end of the file path, otherwise folder that share a prefix could create
- # trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
- azureml_dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id + "/"))
- logging.info("Registering the dataset for future use.")
- azureml_dataset.register(workspace, name=azure_dataset_id)
- return azureml_dataset
-
- def get_dataset_consumption(self,
- azure_dataset_id: str,
- dataset_index: int,
- mountpoint: str) -> DatasetConsumptionConfig:
- """
- Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
- dataset with given name available as a named input, using INPUT_DATA_KEY as the key.
- :param mountpoint: The path at which the dataset should be made available.
- :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
- string to not use any datasets.
- :param dataset_index: suffix for the dataset name, dataset name will be set to INPUT_DATA_KEY_idx
- """
- status = f"Dataset {azure_dataset_id} (index {dataset_index}) will be "
- azureml_dataset = self.get_or_create_dataset(azure_dataset_id=azure_dataset_id)
- if not azureml_dataset:
- raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
- named_input = azureml_dataset.as_named_input(f"{INPUT_DATA_KEY}_{dataset_index}")
- path_on_compute = mountpoint or None
- if self.use_dataset_mount:
- status += "mounted at "
- result = named_input.as_mount(path_on_compute)
- else:
- status += "downloaded to "
- result = named_input.as_download(path_on_compute)
- if path_on_compute:
- status += f"{path_on_compute}."
- else:
- status += "a randomly chosen folder."
- logging.info(status)
- return result
-
@dataclass
class SourceConfig:
@@ -313,13 +252,6 @@ class SourceConfig:
upload_timeout_seconds: int = 36000
environment_variables: Optional[Dict[str, str]] = None
- def set_script_params_except_submit_flag(self) -> None:
- """
- Populates the script_param field of the present object from the arguments in sys.argv, with the exception
- of the "azureml" flag.
- """
- self.script_params = remove_arg(AZURECONFIG_SUBMIT_TO_AZUREML, sys.argv[1:])
-
@dataclass
class ParserResult:
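The two methods removed above (get_or_create_dataset and get_dataset_consumption) are superseded by hi-ml's DatasetConfig, which create_dataset_configs in azure_runner.py now builds. A minimal sketch of the replacement, using the hypothetical dataset name "foo" from the removed docstring and assuming a populated AzureConfig:

from InnerEye.Azure.azure_config import AzureConfig
from health.azure.datasets import DatasetConfig

def foo_dataset_config(azure_config: AzureConfig) -> DatasetConfig:
    # use_mounting mirrors the old use_dataset_mount flag; datastore names the AzureML datastore
    # that holds the /datasets/foo folder.
    return DatasetConfig(name="foo",
                         target_folder="",  # empty string: no explicit mount point, as in create_dataset_configs below
                         use_mounting=azure_config.use_dataset_mount,
                         datastore=azure_config.azureml_datastore)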
diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py
index 888623c35..63c7b1ab4 100644
--- a/InnerEye/Azure/azure_runner.py
+++ b/InnerEye/Azure/azure_runner.py
@@ -4,37 +4,25 @@
# ------------------------------------------------------------------------------------------
import argparse
import getpass
-import hashlib
import logging
import os
-import signal
import sys
from argparse import ArgumentError, ArgumentParser, Namespace
from datetime import date
from pathlib import Path
from typing import Any, Dict, List, Optional
-from azureml.core import Environment, Experiment, Run, ScriptRunConfig
-from azureml.core.runconfig import MpiConfiguration, RunConfiguration
-from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
-from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
-
-from InnerEye.Azure import azure_util
-from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
-from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
- RUN_RECOVERY_ID_KEY_NAME, is_offline_run_context, merge_conda_dependencies
+from InnerEye.Azure.azure_config import AzureConfig, ParserResult
+from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
from InnerEye.Azure.secrets_handling import read_all_settings
-from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.utils.config_loader import ModelConfigLoader
+from health.azure.datasets import DatasetConfig
SLEEP_TIME_SECONDS = 30
-RUN_RECOVERY_FILE = "most_recent_run.txt"
-# The version to use when creating an AzureML Python environment. We create all environments with a unique hashed
-# name, hence version will always be fixed
-ENVIRONMENT_VERSION = "1"
+DEFAULT_DOCKER_BASE_IMAGE = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
# Environment variables used for multi-node training
ENV_AZ_BATCHAI_MPI_MASTER_NODE = "AZ_BATCHAI_MPI_MASTER_NODE"
@@ -47,47 +35,6 @@
ENV_LOCAL_RANK = "LOCAL_RANK"
-def submit_to_azureml(azure_config: AzureConfig,
- source_config: SourceConfig,
- all_azure_dataset_ids: List[str],
- all_dataset_mountpoints: List[str]) -> Run:
- """
- The main entry point when submitting the runner script to AzureML.
- It creates an AzureML workspace if needed, submits an experiment using the code
- as specified in source_config, and waits for completion if needed.
- :param azure_config: azure related configurations to setup valid workspace
- :param source_config: The information about which code should be submitted, and which arguments should be used.
- :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
- :param all_dataset_mountpoints: When using mounted datasets in AzureML, these are the per-dataset mount points.
- The list must have the same length as all_azure_dataset_ids.
- """
- azure_run: Optional[Run] = None
-
- # When running as part of the PR build, jobs frequently get interrupted by new pushes to the repository.
- # In this case, we'd like to cancel the current AzureML run before exiting, to reduce cost.
- # However, at present, this does NOT work, the SIGINT is not propagated through.
- def interrupt_handler(signal: int, _: Any) -> None:
- logging.info('Process interrupted via signal {}'.format(str(signal)))
- if azure_run:
- logging.info('Trying to terminate the AzureML job now.')
- azure_run.cancel()
- sys.exit(0)
-
- for s in [signal.SIGINT, signal.SIGTERM]:
- signal.signal(s, interrupt_handler)
- # create train/test experiment
- script_run_config = create_run_config(azure_config, source_config, all_azure_dataset_ids, all_dataset_mountpoints)
- commandline_args = " ".join(source_config.script_params)
- azure_run = create_and_submit_experiment(azure_config, script_run_config, commandline_args=commandline_args)
-
- if azure_config.wait_for_completion:
- # We want the job output to be visible on the console, but the program should not exit if the
- # job fails because we need to download the pytest result file.
- azure_run.wait_for_completion(show_output=True, raise_on_error=False)
-
- return azure_run
-
-
def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
"""
Creates a dictionary with git-related information, like branch and commit ID. The dictionary key is a string
@@ -107,27 +54,25 @@ def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
}
-def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
+def additional_run_tags(azure_config: AzureConfig, commandline_args: str) -> Dict[str, str]:
"""
- Set metadata for the run
- :param run: Run to set metadata for.
+ Gets the set of tags that will be added to the AzureML run as metadata, like git status and user name.
:param azure_config: The configurations for the present AzureML job
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
"""
git_information = get_git_tags(azure_config)
- run.set_tags({
+ return {
"tag": azure_config.tag,
"model_name": azure_config.model,
"execution_mode": ModelExecutionMode.TRAIN.value if azure_config.train else ModelExecutionMode.TEST.value,
- RUN_RECOVERY_ID_KEY_NAME: azure_util.create_run_recovery_id(run=run),
RUN_RECOVERY_FROM_ID_KEY_NAME: azure_config.run_recovery_id,
"build_number": str(azure_config.build_number),
"build_user": azure_config.build_user,
"build_user_email": azure_config.build_user_email,
**git_information,
"commandline_args": commandline_args,
- CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
- })
+ CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: "-1",
+ }
def create_experiment_name(azure_config: AzureConfig) -> str:
@@ -144,136 +89,19 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
return branch or getpass.getuser() + f"_local_branch_{date.today().strftime('%Y%m')}"
-def create_and_submit_experiment(azure_config: AzureConfig,
- script_run_config: ScriptRunConfig,
- commandline_args: str) -> Run:
- """
- Creates an AzureML experiment in the workspace and submits it for execution.
- :param azure_config: azure related configurations to setup a valid workspace.
- :param script_run_config: The configuration for the script that should be run inside of AzureML.
- :param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
- used to set a tag on the submitted AzureML run.
- :returns: Run object for the submitted AzureML run
- """
- workspace = azure_config.get_workspace()
- experiment_name = create_experiment_name(azure_config)
- exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))
-
- # submit a training/testing run associated with the experiment
- run: Run = exp.submit(script_run_config)
-
- if is_offline_run_context(run):
- # This codepath will only be executed in unit tests, when exp.submit is mocked.
- return run
-
- # Set metadata for the run.
- set_run_tags(run, azure_config, commandline_args=commandline_args)
-
- print("\n==============================================================================")
- print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
-
- if azure_config.run_recovery_id:
- print(f"\nRecovered from: {azure_config.run_recovery_id}")
-
- recovery_id = azure_util.create_run_recovery_id(run)
- recovery_file = Path(RUN_RECOVERY_FILE)
- if recovery_file.exists():
- recovery_file.unlink()
- recovery_file.write_text(recovery_id)
-
- print("Experiment URL: {}".format(exp.get_portal_url()))
- print("Run URL: {}".format(run.get_portal_url()))
- print("If this run fails, re-start runner.py and supply these additional arguments: "
- f"--run_recovery_id={recovery_id}")
- print(f"The run recovery ID has been written to this file: {recovery_file}")
- print("==============================================================================")
- if azure_config.tensorboard and azure_config.azureml:
- print("Starting TensorBoard now because you specified --tensorboard")
- monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
- else:
- print(f"To monitor this run locally using TensorBoard, run the script: "
- f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
- print("==============================================================================")
- return run
-
-
-def get_or_create_python_environment(azure_config: AzureConfig,
- source_config: SourceConfig,
- environment_name: str = "",
- register_environment: bool = True) -> Environment:
- """
- Creates a description for the Python execution environment in AzureML, based on the Conda environment
- definition files that are specified in `source_config`. If such environment with this Conda environment already
- exists, it is retrieved, otherwise created afresh.
- :param azure_config: azure related configurations to use for model scale-out behaviour
- :param source_config: configurations for model execution, such as name and execution mode
- :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
- is not found, create one from the Conda files provided. This parameter is meant to be used when running
- inference for an existing model.
- :param register_environment: If True, the Python environment will be registered in the AzureML workspace. If
- False, it will only be created, but not registered. Use this for unit testing.
- """
- # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not be
- # necessary if the innereye package is installed. It is necessary when working with an outer project and
- # InnerEye as a git submodule and submitting jobs from the local machine.
- # In case of version conflicts, the package version in the outer project is given priority.
- conda_dependencies, merged_yaml = merge_conda_dependencies(source_config.conda_dependencies_files) # type: ignore
- if azure_config.pip_extra_index_url:
- # When an extra-index-url is supplied, swap the order in which packages are searched for.
- # This is necessary if we need to consume packages from extra-index that clash with names of packages on
- # pypi
- conda_dependencies.set_pip_option(f"--index-url {azure_config.pip_extra_index_url}")
- conda_dependencies.set_pip_option("--extra-index-url https://pypi.org/simple")
- env_variables = {
- "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
- # Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
- "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
- "MKL_SERVICE_FORCE_INTEL": "1",
- # Switching to a new software stack in AML for mounting datasets
- "RSLEX_DIRECT_VOLUME_MOUNT": "true",
- "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
- **(source_config.environment_variables or {})
- }
- base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
- # Create a name for the environment that will likely uniquely identify it. AzureML does hashing on top of that,
- # and will re-use existing environments even if they don't have the same name.
- # Hashing should include everything that can reasonably change. Rely on hashlib here, because the built-in
- # hash function gives different results for the same string in different python instances.
- hash_string = "\n".join([merged_yaml, azure_config.docker_shm_size, base_image, str(env_variables)])
- sha1 = hashlib.sha1(hash_string.encode("utf8"))
- overall_hash = sha1.hexdigest()[:32]
- unique_env_name = f"InnerEye-{overall_hash}"
- try:
- env_name_to_find = environment_name or unique_env_name
- env = Environment.get(azure_config.get_workspace(), name=env_name_to_find, version=ENVIRONMENT_VERSION)
- logging.info(f"Using existing Python environment '{env.name}'.")
- return env
- except Exception:
- logging.info(f"Python environment '{unique_env_name}' does not yet exist, creating and registering it.")
- env = Environment(name=unique_env_name)
- env.docker.enabled = True
- env.docker.shm_size = azure_config.docker_shm_size
- env.python.conda_dependencies = conda_dependencies
- env.docker.base_image = base_image
- env.environment_variables = env_variables
- if register_environment:
- env.register(azure_config.get_workspace())
- return env
-
-
-def create_dataset_consumptions(azure_config: AzureConfig,
- all_azure_dataset_ids: List[str],
- all_dataset_mountpoints: List[str]) -> List[DatasetConsumptionConfig]:
+def create_dataset_configs(azure_config: AzureConfig,
+ all_azure_dataset_ids: List[str],
+ all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
"""
Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
skipped.
:param azure_config: azure related configurations to use for model scale-out behaviour
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
- :return: A list of DatasetConsumptionConfig, in the same order as datasets were provided in all_azure_dataset_ids,
+ :return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
omitting datasets with an empty name.
"""
- dataset_consumptions: List[DatasetConsumptionConfig] = []
+ datasets: List[DatasetConfig] = []
if len(all_dataset_mountpoints) > 0:
if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
@@ -282,64 +110,14 @@ def create_dataset_consumptions(azure_config: AzureConfig,
all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
if dataset_id:
- dataset_consumption = azure_config.get_dataset_consumption(dataset_id, i, mount_point)
- dataset_consumptions.append(dataset_consumption)
+ datasets.append(DatasetConfig(name=dataset_id,
+ target_folder=mount_point,
+ use_mounting=azure_config.use_dataset_mount,
+ datastore=azure_config.azureml_datastore))
elif mount_point:
raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
f"been provided ('{mount_point}')")
- return dataset_consumptions
-
-
-def create_run_config(azure_config: AzureConfig,
- source_config: SourceConfig,
- all_azure_dataset_ids: List[str],
- all_dataset_mountpoints: List[str],
- environment_name: str = "") -> ScriptRunConfig:
- """
- Creates a configuration to run the InnerEye training script in AzureML.
- :param azure_config: azure related configurations to use for model scale-out behaviour
- :param source_config: configurations for model execution, such as name and execution mode
- :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
- :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
- :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
- is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
- when running inference for an existing model.
- :return: The configured script run.
- """
- dataset_consumptions = create_dataset_consumptions(azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
- # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
- entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
- logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
- f"source directory {source_config.root_folder})")
- max_run_duration = None
- if azure_config.max_run_duration:
- max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
- workspace = azure_config.get_workspace()
- run_config = RunConfiguration(
- script=entry_script_relative_path,
- arguments=source_config.script_params,
- )
- run_config.environment = get_or_create_python_environment(azure_config, source_config,
- environment_name=environment_name)
- run_config.target = azure_config.cluster
- run_config.max_run_duration_seconds = max_run_duration
- if azure_config.num_nodes > 1:
- distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
- run_config.mpi = distributed_job_config
- run_config.framework = "Python"
- run_config.communicator = "IntelMpi"
- run_config.node_count = distributed_job_config.node_count
- if len(dataset_consumptions) > 0:
- run_config.data = {dataset.name: dataset for dataset in dataset_consumptions}
- # Use blob storage for storing the source, rather than the FileShares section of the storage account.
- run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
- script_run_config = ScriptRunConfig(
- source_directory=str(source_config.root_folder),
- run_config=run_config,
- )
- if azure_config.hyperdrive:
- script_run_config = source_config.hyperdrive_config_func(script_run_config) # type: ignore
- return script_run_config
+ return datasets
def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentParser:
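With the submission logic gone from this file, the two helpers that remain feed the new hi-ml based submission path in runner.py: create_dataset_configs turns dataset names and mount points into DatasetConfig objects, and additional_run_tags produces the metadata dictionary attached to the AzureML run. A small usage sketch with hypothetical dataset names:

import sys
from typing import Dict, List, Tuple

from health.azure.datasets import DatasetConfig

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_runner import additional_run_tags, create_dataset_configs

def collect_submission_inputs(azure_config: AzureConfig) -> Tuple[List[DatasetConfig], Dict[str, str]]:
    # Entries with an empty name are skipped, and an empty mount point list is padded so that
    # AzureML chooses where to place each dataset.
    dataset_configs = create_dataset_configs(azure_config,
                                             all_azure_dataset_ids=["foo", ""],
                                             all_dataset_mountpoints=[])
    # A plain dictionary of string-valued tags that is attached to the AzureML run at submission time.
    tags = additional_run_tags(azure_config, commandline_args=" ".join(sys.argv[1:]))
    return dataset_configs, tags

In runner.py these presumably end up as the dataset and tag arguments of hi-ml's submit_to_azure_if_needed; that call sits outside the portion of the diff shown here.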
diff --git a/InnerEye/Azure/azure_util.py b/InnerEye/Azure/azure_util.py
index 210480022..c91c6cc2a 100644
--- a/InnerEye/Azure/azure_util.py
+++ b/InnerEye/Azure/azure_util.py
@@ -5,19 +5,15 @@
import logging
import os
import re
-import tempfile
from pathlib import Path
from typing import Generator, List, Optional, Tuple
-import conda_merge
-import ruamel.yaml
-from azureml._restclient.constants import RunStatus
from azureml.core import Experiment, Run, Workspace, get_run
-from azureml.core.conda_dependencies import CondaDependencies
from azureml.exceptions import UserErrorException
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
+from health.azure.azure_util import create_run_recovery_id
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1
EXPERIMENT_RUN_SEPARATOR = ":"
@@ -45,16 +41,6 @@
INNEREYE_SDK_VERSION = "1.0"
-def create_run_recovery_id(run: Run) -> str:
- """
- Creates an recovery id for a run so it's checkpoints could be recovered for training/testing
-
- :param run: an instantiated run.
- :return: recovery id for a given run in format: [experiment name]:[run id]
- """
- return str(run.experiment.name + EXPERIMENT_RUN_SEPARATOR + run.id)
-
-
def split_recovery_id(id: str) -> Tuple[str, str]:
"""
Splits a run ID into the experiment name and the actual run.
@@ -149,12 +135,13 @@ def fetch_child_runs(run: Run, status: Optional[str] = None,
run = PARENT_RUN_CONTEXT
children_runs = list(run.get_children(tags=RUN_RECOVERY_ID_KEY_NAME))
if 0 < expected_number_cross_validation_splits != len(children_runs):
- logging.warning(
- f"The expected number of child runs was {expected_number_cross_validation_splits}."
- f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
- run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
- for i in range(expected_number_cross_validation_splits)]
- children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
+ if 0 < expected_number_cross_validation_splits != len(children_runs):
+ logging.warning(
+ f"The expected number of child runs was {expected_number_cross_validation_splits}."
+ f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
+ run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
+ for i in range(expected_number_cross_validation_splits)]
+ children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
if status is not None:
children_runs = [child_run for child_run in children_runs if child_run.get_status() == status]
return children_runs
@@ -237,53 +224,6 @@ def strip_prefix(string: str, prefix: str) -> str:
return string
-def _log_conda_dependencies_stats(conda: CondaDependencies, message_prefix: str) -> None:
- """
- Write number of conda and pip packages to logs.
- :param conda: A conda dependencies object
- :param message_prefix: A message to prefix to the log string.
- """
- conda_packages_count = len(list(conda.conda_packages))
- pip_packages_count = len(list(conda.pip_packages))
- logging.info(f"{message_prefix}: {conda_packages_count} conda packages, {pip_packages_count} pip packages")
- logging.debug(" Conda packages:")
- for p in conda.conda_packages:
- logging.debug(f" {p}")
- logging.debug(" Pip packages:")
- for p in conda.pip_packages:
- logging.debug(f" {p}")
-
-
-def merge_conda_files(files: List[Path], result_file: Path) -> None:
- """
- Merges the given Conda environment files using the conda_merge package, and writes the merged file to disk.
- :param files: The Conda environment files to read.
- :param result_file: The location where the merge results should be written.
- """
- # This code is a slightly modified version of conda_merge. That code can't be re-used easily
- # it defaults to writing to stdout
- env_definitions = [conda_merge.read_file(str(f)) for f in files]
- unified_definition = {}
- NAME = "name"
- CHANNELS = "channels"
- DEPENDENCIES = "dependencies"
- name = conda_merge.merge_names(env.get(NAME) for env in env_definitions)
- if name:
- unified_definition[NAME] = name
- try:
- channels = conda_merge.merge_channels(env.get(CHANNELS) for env in env_definitions)
- except conda_merge.MergeError:
- logging.error("Failed to merge channel priorities.")
- raise
- if channels:
- unified_definition[CHANNELS] = channels
- deps = conda_merge.merge_dependencies(env.get(DEPENDENCIES) for env in env_definitions)
- if deps:
- unified_definition[DEPENDENCIES] = deps
- with result_file.open("w") as f:
- ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
-
-
def get_all_environment_files(project_root: Path) -> List[Path]:
"""
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
@@ -299,27 +239,6 @@ def get_all_environment_files(project_root: Path) -> List[Path]:
return files
-def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
- """
- Creates a CondaDependencies object from the Conda environments specified in one or more files.
- The resulting object contains the union of the Conda and pip packages in the files, where merging
- is done via the conda_merge package.
- :param files: The Conda environment files to read.
- :return: Tuple of (CondaDependencies object that contains packages from all the files,
- string contents of the merge Conda environment)
- """
- for file in files:
- _log_conda_dependencies_stats(CondaDependencies(file), f"Conda environment in {file}")
- temp_merged_file = tempfile.NamedTemporaryFile(delete=False)
- merged_file_path = Path(temp_merged_file.name)
- merge_conda_files(files, result_file=merged_file_path)
- merged_dependencies = CondaDependencies(temp_merged_file.name)
- _log_conda_dependencies_stats(merged_dependencies, "Merged Conda environment")
- merged_file_contents = merged_file_path.read_text()
- temp_merged_file.close()
- return merged_dependencies, merged_file_contents
-
-
def tag_values_all_distinct(runs: List[Run], tag: str) -> bool:
"""
Returns True iff the runs all have the specified tag and all the values are different.
@@ -395,26 +314,6 @@ def is_running_on_azure_agent() -> bool:
return bool(os.environ.get("AGENT_OS", None))
-def is_run_and_child_runs_completed(run: Run) -> bool:
- """
- Checks if the given run has successfully completed. If the run has child runs, it also checks if the child runs
- completed successfully.
- :param run: The AzureML run to check.
- :return: True if the run and all child runs completed successfully.
- """
-
- def is_completed(run: Run) -> bool:
- status = run.get_status()
- if run.status == RunStatus.COMPLETED:
- return True
- logging.info(f"Run {run.id} in experiment {run.experiment.name} finished with status {status}.")
- return False
-
- runs = list(run.get_children())
- runs.append(run)
- return all(is_completed(run) for run in runs)
-
-
def get_comparison_baseline_paths(outputs_folder: Path,
blob_path: Path, run: Run,
dataset_csv_file_name: str) -> \
@@ -458,43 +357,3 @@ def step_up_directories(path: Path) -> Generator[Path, None, None]:
if parent == path:
break
path = parent
-
-
-def remove_arg(arg: str, args: List[str]) -> List[str]:
- """
- Remove an argument from a list of arguments. The argument list is assumed to contain
- elements of the form:
- "-a", "--arg1", "--arg2", "value2", or "--arg3=value"
- If there is an item matching "--arg" then it will be removed from the list.
-
- :param arg: Argument to look for.
- :param args: List of arguments to scan.
- :return: List of arguments with --arg removed, if present.
- """
- arg_opt = f"--{arg}"
- no_arg_opt = f"--no-{arg}"
- retained_args = []
- i = 0
- while i < len(args):
- arg = args[i]
- if arg.startswith(arg_opt):
- if len(arg) == len(arg_opt):
- # The commandline argument is "--arg", with something possibly following: This can either be
- # "--arg_opt value" or "--arg_opt --some_other_param"
- if i < (len(args) - 1):
- # If the next argument starts with a "-" then assume that it does not belong to the --arg
- # argument. If there is no "-", assume it belongs to the --arg_opt argument, and skip both
- if not args[i + 1].startswith("-"):
- i = i + 1
- elif arg[len(arg_opt)] == "=":
- # The commandline argument is "--arg=value": Continue with next arg
- pass
- else:
- # The argument list contains an argument like "--arg_other_param": Keep that.
- retained_args.append(arg)
- elif arg == no_arg_opt:
- pass
- else:
- retained_args.append(arg)
- i = i + 1
- return retained_args
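Several helpers removed from this module are now imported from health.azure.azure_util (see the import added at the top of this file). A minimal sketch, assuming the hi-ml versions keep the signatures of the removed local helpers:

from pathlib import Path

from azureml.core import Run
from health.azure.azure_util import create_run_recovery_id, merge_conda_files

# Hypothetical file names; the hi-ml helper is assumed to take the same arguments as the removed
# local merge_conda_files (a list of input Conda files plus the path of the merged output file).
merge_conda_files([Path("environment.yml"), Path("extra_env.yml")], result_file=Path("merged_env.yml"))

# Inside an AzureML job: the recovery ID keeps the "<experiment name>:<run id>" format that the
# removed local create_run_recovery_id produced.
recovery_id = create_run_recovery_id(Run.get_context())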
diff --git a/InnerEye/Common/common_util.py b/InnerEye/Common/common_util.py
index d1fb1f6f5..0433da5a5 100644
--- a/InnerEye/Common/common_util.py
+++ b/InnerEye/Common/common_util.py
@@ -14,7 +14,6 @@
from pathlib import Path
from typing import Any, Callable, Generator, Iterable, List, Optional, Union
-from InnerEye.Common import fixed_paths
from InnerEye.Common.fixed_paths import repository_root_directory
from InnerEye.Common.type_annotations import PathOrString
from InnerEye.ML.common import ModelExecutionMode
@@ -25,7 +24,6 @@
empty_string_to_none = lambda x: None if (x is None or len(x.strip()) == 0) else x
string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(x)
-
SUBJECT_METRICS_FILE_NAME = "metrics.csv"
EPOCH_METRICS_FILE_NAME = "epoch_metrics.csv"
METRICS_AGGREGATES_FILE = "metrics_aggregates.csv"
@@ -390,20 +388,6 @@ def remove_file_or_directory(pth: Path) -> None:
pth.unlink()
-def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
- """
- Checks if the Python paths in sys.path already contain the given folder, which is expected to be relative
- to the repository root. If that folder is not yet in sys.path, add it.
- """
- full_folder = repository_root_directory() / folder_under_repo_root
- for path_str in sys.path:
- path = Path(path_str)
- if path == full_folder:
- return
- print(f"Adding {full_folder} to sys.path")
- sys.path.append(str(full_folder))
-
-
@contextmanager
def change_working_directory(path_or_str: PathOrString) -> Generator:
"""
@@ -414,16 +398,3 @@ def change_working_directory(path_or_str: PathOrString) -> Generator:
os.chdir(new_path)
yield
os.chdir(old_path)
-
-
-@contextmanager
-def append_to_amlignore(lines_to_append: List[str]) -> Generator:
- """
- Context manager that appends lines to the .amlignore file, and reverts to the previous contents after.
- """
- amlignore = fixed_paths.repository_root_directory(".amlignore")
- old_contents = amlignore.read_text()
- new_contents = old_contents.splitlines() + lines_to_append
- amlignore.write_text("\n".join(new_contents))
- yield
- amlignore.write_text(old_contents)
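Of the helpers removed here, add_folder_to_sys_path_if_needed is superseded by add_submodules_to_path (added to fixed_paths.py below), and append_to_amlignore is presumably no longer needed once job submission goes through hi-ml. The change_working_directory context manager that stays in this module simply switches the working directory and switches back afterwards; a small usage sketch with a hypothetical folder name:

from InnerEye.Common.common_util import change_working_directory

# "outputs" is a hypothetical folder that must already exist; the previous working directory is
# restored when the block exits.
with change_working_directory("outputs"):
    ...  # code here runs with "outputs" as the current working directory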
diff --git a/InnerEye/Common/fixed_paths.py b/InnerEye/Common/fixed_paths.py
index 5d9e92934..cc0b5c922 100755
--- a/InnerEye/Common/fixed_paths.py
+++ b/InnerEye/Common/fixed_paths.py
@@ -2,7 +2,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
+import logging
import os
+import sys
from pathlib import Path
from typing import Optional
@@ -91,3 +93,28 @@ def get_environment_yaml_file() -> Path:
raise ValueError(f"File {ENVIRONMENT_YAML_FILE_NAME} was not found not found in the package folder "
f"{INNEREYE_PACKAGE_ROOT}, and not in the repository root {repository_root_directory()}.")
return env
+
+
+def add_submodules_to_path() -> None:
+ """
+ Adds all submodules that the code uses (fastMRI and hi-ml) to sys.path. This is necessary to make the code
+ work without any further changes when switching between using hi-ml as an installed package and as a git
+ submodule for development.
+ It also adds the InnerEye root folder to sys.path, which is necessary to make AzureML and PyTorch Lightning
+ work together: when spawning additional processes for DDP, the working directory is not correctly picked
+ up in sys.path.
+ """
+ innereye_root = repository_root_directory()
+ folders_to_add = [(innereye_root, "InnerEye"),
+ (innereye_root / "fastMRI", "fastmri"),
+ (innereye_root / "hi-ml" / "src", "health")]
+ for (folder, subfolder_that_must_exist) in folders_to_add:
+ if (folder / subfolder_that_must_exist).is_dir():
+ folder_str = str(folder)
+ if folder_str not in sys.path:
+ logging.debug(f"Adding folder {folder} to sys.path")
+ sys.path.insert(0, folder_str)
+ else:
+ logging.debug(f"Not adding folder {folder} because it is already in sys.path")
+ else:
+ logging.debug(f"Not adding folder {folder} because it does not have subfolder {subfolder_that_must_exist}")
diff --git a/InnerEye/ML/configs/other/fastmri_varnet.py b/InnerEye/ML/configs/other/fastmri_varnet.py
index 9a77bbe40..29618a49c 100644
--- a/InnerEye/ML/configs/other/fastmri_varnet.py
+++ b/InnerEye/ML/configs/other/fastmri_varnet.py
@@ -16,11 +16,7 @@
from pytorch_lightning import LightningDataModule, LightningModule
from torch.utils.tensorboard import SummaryWriter
-from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
from InnerEye.ML.lightning_container import LightningContainer
-
-add_folder_to_sys_path_if_needed("fastMRI")
-
from fastmri.data.subsample import create_mask_for_mask_type
from fastmri.data.transforms import VarNetDataTransform
from fastmri.pl_modules import FastMriDataModule, VarNetModule
diff --git a/InnerEye/ML/normalize_and_visualize_dataset.py b/InnerEye/ML/normalize_and_visualize_dataset.py
index 7872e89ef..deca45166 100644
--- a/InnerEye/ML/normalize_and_visualize_dataset.py
+++ b/InnerEye/ML/normalize_and_visualize_dataset.py
@@ -73,9 +73,8 @@ def main(yaml_file_path: Path) -> None:
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
"""
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
- local_dataset = MLRunner(config, azure_config=runner_config).mount_or_download_dataset(config.azure_dataset_id,
- config.local_dataset)
- assert local_dataset is not None
+ runner = MLRunner(config, azure_config=runner_config)
+ local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
normalizer_config = NormalizeAndVisualizeConfig(**args)
actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
diff --git a/InnerEye/ML/reports/classification_crossval_report.ipynb b/InnerEye/ML/reports/classification_crossval_report.ipynb
index 934f81ebf..1c364f294 100644
--- a/InnerEye/ML/reports/classification_crossval_report.ipynb
+++ b/InnerEye/ML/reports/classification_crossval_report.ipynb
@@ -50,6 +50,8 @@
"import sys\n",
"if str(innereye_path) not in sys.path:\n",
" sys.path.append(str(innereye_path))\n",
+ "from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
+ "add_submodules_to_path()\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
diff --git a/InnerEye/ML/reports/classification_multilabel_report.ipynb b/InnerEye/ML/reports/classification_multilabel_report.ipynb
index 91e453611..daaf7224f 100644
--- a/InnerEye/ML/reports/classification_multilabel_report.ipynb
+++ b/InnerEye/ML/reports/classification_multilabel_report.ipynb
@@ -56,6 +56,8 @@
"import sys\n",
"if str(innereye_path) not in sys.path:\n",
" sys.path.append(str(innereye_path))\n",
+ "from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
+ "add_submodules_to_path()\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
diff --git a/InnerEye/ML/reports/segmentation_report.ipynb b/InnerEye/ML/reports/segmentation_report.ipynb
index b9344b8c4..18cb6a1d2 100644
--- a/InnerEye/ML/reports/segmentation_report.ipynb
+++ b/InnerEye/ML/reports/segmentation_report.ipynb
@@ -45,6 +45,9 @@
"\n",
"if str(innereye_path) not in sys.path:\n",
" sys.path.append(str(innereye_path))\n",
+ "from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
+ "add_submodules_to_path()\n",
+ "\n",
"from InnerEye.ML.reports.segmentation_report import plot_scores_for_csv\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
@@ -159,4 +162,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index bd0651e87..38dca7724 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -21,12 +21,12 @@
from torch.utils.data import DataLoader
from InnerEye.Azure import azure_util
-from InnerEye.Azure.azure_config import AzureConfig, INPUT_DATA_KEY
-from InnerEye.Azure.azure_runner import ENVIRONMENT_VERSION, ENV_OMPI_COMM_WORLD_RANK, get_git_tags
+from InnerEye.Azure.azure_config import AzureConfig
+from InnerEye.Azure.azure_runner import ENV_OMPI_COMM_WORLD_RANK, get_git_tags
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, \
EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, \
PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, \
- create_run_recovery_id, get_all_environment_files, is_offline_run_context, merge_conda_files
+ get_all_environment_files, is_offline_run_context
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, \
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
@@ -35,11 +35,12 @@
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
print_exception, remove_file_or_directory
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
+from InnerEye.Common.type_annotations import PathOrString
from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
-from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
- FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint, EXTRA_RUN_SUBFOLDER
+from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, EXTRA_RUN_SUBFOLDER, \
+ FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint
from InnerEye.ML.lightning_base import InnerEyeContainer
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForSegmentation
@@ -57,26 +58,14 @@
from InnerEye.ML.visualizers import activation_maps
from InnerEye.ML.visualizers.plot_cross_validation import \
get_config_and_results_for_offline_runs, plot_cross_validation_from_files
+from health.azure.azure_util import ENVIRONMENT_VERSION, create_run_recovery_id, merge_conda_files
+from health.azure.datasets import get_or_create_dataset
+from health.azure.himl import AzureRunInfo
ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any]
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
-def try_to_mount_input_dataset(dataset_index: int = 0) -> Optional[Path]:
- """
- Checks if the AzureML run context has a field for input datasets. If yes, the dataset stored there is
- returned as a Path. Returns None if no input datasets was found.
-
- :param dataset_index: suffix of AML dataset name, return path to INPUT_DATA_KEY_idx dataset
- """
- if hasattr(RUN_CONTEXT, "input_datasets"):
- try:
- return Path(RUN_CONTEXT.input_datasets[f"{INPUT_DATA_KEY}_{dataset_index}"])
- except KeyError:
- logging.warning(f"Run context field input_datasets has no {INPUT_DATA_KEY}_{dataset_index} entry.")
- return None
-
-
def download_dataset(azure_dataset_id: str,
target_folder: Path,
dataset_csv: str,
@@ -95,7 +84,9 @@ def download_dataset(azure_dataset_id: str,
:return: A path on the local machine that contains the dataset.
"""
logging.info("Trying to download dataset via AzureML datastore now.")
- azure_dataset = azure_config.get_or_create_dataset(azure_dataset_id)
+ azure_dataset = get_or_create_dataset(workspace=azure_config.get_workspace(),
+ datastore_name=azure_config.azureml_datastore,
+ dataset_name=azure_dataset_id)
if not isinstance(azure_dataset, FileDataset):
raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
# The downloaded dataset may already exist from a previous run.
@@ -122,6 +113,20 @@ def download_dataset(azure_dataset_id: str,
return expected_dataset_path
+def check_dataset_folder_exists(local_dataset: PathOrString) -> Path:
+ """
+ Checks if a folder with a local dataset exists. If it does exist, return the argument converted to a Path instance.
+ If it does not exist, raise a FileNotFoundError.
+ :param local_dataset: The dataset folder to check.
+ :return: The local_dataset argument, converted to a Path.
+ """
+ expected_dir = Path(local_dataset)
+ if not expected_dir.is_dir():
+ raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
+ logging.info(f"Model training will use the local dataset provided in {expected_dir}")
+ return expected_dir
+
+
def log_metrics(metrics: Dict[ModelExecutionMode, InferenceMetrics],
run_context: Run) -> None:
"""
@@ -185,36 +190,40 @@ def __init__(self,
self.output_subfolder = output_subfolder
self._has_setup_run = False
- def setup(self, use_mount_or_download_dataset: bool = True) -> None:
+ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None:
"""
If the present object is using one of the InnerEye built-in models, create a (fake) container for it
and call the setup method. It sets the random seeds, and then creates the actual Lightning modules.
- :param use_mount_or_download_dataset: If True, try to download or mount the dataset that is used by the model.
- If False, assume that the dataset is already available (this should only be used for unit tests).
+ :param azure_run_info: When running in AzureML or on a local VM, this contains the paths to the datasets.
+ This can be missing when running in unit tests, where the local dataset paths are already populated.
"""
if self._has_setup_run:
return
- if (not self.azure_config.only_register_model) and use_mount_or_download_dataset:
+ if (not self.azure_config.only_register_model) and azure_run_info:
+ dataset_index = 0
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
# and config.local_dataset was not already set.
# This must happen before container setup because that could already read datasets.
- mounted_dataset = self.mount_or_download_dataset(self.container.azure_dataset_id,
- self.container.local_dataset)
- if mounted_dataset is not None:
+ if self.container.azure_dataset_id:
+ mounted_dataset = azure_run_info.input_datasets[dataset_index]
+ if mounted_dataset is None:
+ mounted_dataset = self.download_or_use_existing_dataset(self.container.azure_dataset_id,
+ self.container.local_dataset)
self.container.local_dataset = mounted_dataset
-
- extra_locals = []
- if self.is_offline_run and len(self.container.extra_local_dataset_paths) != 0:
- for local in self.container.extra_local_dataset_paths:
- extra_local_dataset = self.mount_or_download_dataset(None, local)
- assert extra_local_dataset is not None # for mypy
- extra_locals.append(extra_local_dataset)
- elif len(self.container.extra_azure_dataset_ids) != 0:
- for i, azure_id in enumerate(self.container.extra_azure_dataset_ids, 1):
- extra_local_dataset = self.mount_or_download_dataset(azure_id, None, dataset_index=i)
- assert extra_local_dataset is not None # for mypy
- extra_locals.append(extra_local_dataset)
- self.container.extra_local_dataset_paths = extra_locals
+ dataset_index += 1
+ if self.container.extra_azure_dataset_ids:
+ num_extra_local_datasets = len(self.container.extra_local_dataset_paths)
+ extra_locals: List[Path] = []
+ for i, extra_azure_id in enumerate(self.container.extra_azure_dataset_ids):
+ if num_extra_local_datasets > 0 and i >= num_extra_local_datasets:
+ raise ValueError(f"The model refers to an Azure dataset '{extra_azure_id}' at index {i}, "
+ f"but not enough local datasets were given.")
+ mounted_dataset = azure_run_info.input_datasets[dataset_index]
+ if mounted_dataset is None:
+ local = None if num_extra_local_datasets == 0 else self.container.extra_local_dataset_paths[i]
+ mounted_dataset = self.download_or_use_existing_dataset(extra_azure_id, local_dataset=local)
+ extra_locals.append(mounted_dataset)
+ self.container.extra_local_dataset_paths = extra_locals
# Ensure that we use fixed seeds before initializing the PyTorch models
seed_everything(self.container.get_effective_random_seed())
# Creating the folder structure must happen before the LightningModule is created, because the output
@@ -235,10 +244,11 @@ def setup(self, use_mount_or_download_dataset: bool = True) -> None:
if self.container.pretraining_run_recovery_id is not None:
run_to_recover = self.azure_config.fetch_run(self.container.pretraining_run_recovery_id.strip())
+ only_return_path = not is_global_rank_zero()
run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.container,
run_to_recover,
EXTRA_RUN_SUBFOLDER,
- only_return_path=not is_global_rank_zero())
+ only_return_path=only_return_path)
self.container.pretraining_run_checkpoints = run_recovery_object
# A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
@@ -543,54 +553,30 @@ def create_activation_maps(self) -> None:
activation_maps.extract_activation_maps(self.innereye_config) # type: ignore
logging.info("Successfully extracted and saved activation maps")
- def mount_or_download_dataset(self,
- azure_dataset_id: Optional[str],
- local_dataset: Optional[Path],
- dataset_index: int = 0) -> Optional[Path]:
+ def download_or_use_existing_dataset(self,
+ azure_dataset_id: Optional[str],
+ local_dataset: Optional[Path]) -> Path:
"""
Makes the dataset that the model uses available on the executing machine. If the present training run is outside
of AzureML, it expects that either the model has a `local_dataset` field set, in which case no action will be
taken. If a dataset is specified in `azure_dataset_id`, it will attempt to download the dataset from Azure
into the local repository, in the "datasets" folder.
- If the training run is inside of AzureML, the dataset that was specified at job submission time will be
- mounted or downloaded.
:param azure_dataset_id: id of the dataset in AML workspace
:param local_dataset: alternatively local path for this dataset
- :param index of the dataset processed
- :returns: the path of the dataset on the executing machine.
+ :returns: The path of the dataset on the executing machine.
"""
- if self.is_offline_run:
- # A dataset, either local or in Azure, is required for the built-in InnerEye models. When models are
- # specified via a LightningContainer, these dataset fields are optional, because the container datasets
- # could be downloaded even from the web.
- is_dataset_required = isinstance(self.container, InnerEyeContainer)
- # The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
- # Otherwise, download the dataset specified by the azure_dataset_id
- if is_dataset_required:
- if (not azure_dataset_id) and (local_dataset is None):
- raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
- if local_dataset:
- expected_dir = Path(local_dataset)
- if not expected_dir.is_dir():
- raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
- logging.info(f"Model training will use the local dataset provided in {expected_dir}")
- return expected_dir
- if azure_dataset_id:
- dataset_csv = ""
- if isinstance(self.model_config, DeepLearningConfig):
- dataset_csv = self.model_config.dataset_csv
- return download_dataset(azure_dataset_id=azure_dataset_id,
- target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
- dataset_csv=dataset_csv, azure_config=self.azure_config)
- return None
-
- # Inside of AzureML, datasets can be either mounted or downloaded.
+ if not self.is_offline_run:
+ raise ValueError("This function should only be called in runs outside AzureML.")
+ if local_dataset:
+ return check_dataset_folder_exists(local_dataset)
if azure_dataset_id:
- mounted = try_to_mount_input_dataset(dataset_index)
- if not mounted:
- raise ValueError("Unable to mount or download input dataset.")
- return mounted
- return None
+ dataset_csv = ""
+ if isinstance(self.model_config, DeepLearningConfig):
+ dataset_csv = self.model_config.dataset_csv
+ return download_dataset(azure_dataset_id=azure_dataset_id,
+ target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
+ dataset_csv=dataset_csv, azure_config=self.azure_config)
+ raise ValueError("The model must contain either local_dataset or azure_dataset_id")
def set_multiprocessing_start_method(self) -> None:
"""
diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py
index 2ab2c6f4b..b992a4bf7 100755
--- a/InnerEye/ML/runner.py
+++ b/InnerEye/ML/runner.py
@@ -2,12 +2,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
+import logging
import os
import sys
-import warnings
+import uuid
from pathlib import Path
-
-import matplotlib
+from typing import Optional, Tuple
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
@@ -21,29 +21,29 @@
if innereye_root_str not in sys.path:
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
-# We change the current working directory before starting the actual training. However, this throws off starting
-# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
-# path.
-runner_path = Path(sys.argv[0])
-if not runner_path.is_absolute():
- sys.argv[0] = str(runner_path.absolute())
-
-import logging
-from typing import Optional, Tuple
+from InnerEye.Common import fixed_paths
+fixed_paths.add_submodules_to_path()
from azureml._base_sdk_common import user_agent
-from azureml.core import Run
+from azureml.core import Run, ScriptRunConfig
+from health.azure.himl import AzureRunInfo, submit_to_azure_if_needed
+from health.azure.azure_util import create_run_recovery_id, merge_conda_files, to_azure_friendly_string
+import matplotlib
+from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
-from InnerEye.Azure.azure_runner import create_runner_parser, get_git_tags, parse_args_and_add_yaml_variables, \
- parse_arguments, set_environment_variables_for_multi_node, submit_to_azureml
-from InnerEye.Azure.azure_util import RUN_CONTEXT, get_all_environment_files, is_offline_run_context, \
- is_run_and_child_runs_completed
+from InnerEye.Azure.azure_runner import (DEFAULT_DOCKER_BASE_IMAGE, create_dataset_configs, create_experiment_name,
+ create_runner_parser,
+ get_git_tags,
+ parse_args_and_add_yaml_variables,
+ parse_arguments, additional_run_tags,
+ set_environment_variables_for_multi_node)
+from InnerEye.Azure.azure_util import (RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files,
+ is_offline_run_context)
from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
-from InnerEye.Common import fixed_paths
-from InnerEye.Common.common_util import FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
- append_to_amlignore, disable_logging_to_file, is_linux, logging_to_stdout
+from InnerEye.Common.common_util import (FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE,
+ disable_logging_to_file, is_linux, logging_to_stdout)
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.deep_learning_config import DeepLearningConfig
@@ -54,6 +54,13 @@
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.lightning_container import LightningContainer
+# We change the current working directory before starting the actual training. However, this throws off starting
+# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
+# path.
+runner_path = Path(sys.argv[0])
+if not runner_path.is_absolute():
+ sys.argv[0] = str(runner_path.absolute())
+
def initialize_rpdb() -> None:
"""
@@ -185,7 +192,7 @@ def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -
logging.info("extra_code_directory is unset")
return parser_result
- def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
+ def run(self) -> Tuple[Optional[DeepLearningConfig], AzureRunInfo]:
"""
The main entry point for training and testing models from the commandline. This chooses a model to train
via a commandline argument, runs training or testing, and writes all required info to disk and logs.
@@ -201,64 +208,130 @@ def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
if self.lightning_container.perform_cross_validation:
# force hyperdrive usage if performing cross validation
self.azure_config.hyperdrive = True
- run_object: Optional[Run] = None
- if self.azure_config.azureml:
- run_object = self.submit_to_azureml()
- else:
- self.run_in_situ()
+ azure_run_info = self.submit_to_azureml_if_needed()
+ self.run_in_situ(azure_run_info)
if self.model_config is None:
- return self.lightning_container, run_object
- return self.model_config, run_object
+ return self.lightning_container, azure_run_info
+ return self.model_config, azure_run_info
- def submit_to_azureml(self) -> Run:
+ def submit_to_azureml_if_needed(self) -> AzureRunInfo:
"""
Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
completion and the Run did not succeed.
"""
- # The adal package creates a logging.info line each time it gets an authentication token, avoid that.
- logging.getLogger('adal-python').setLevel(logging.WARNING)
- # Azure core prints full HTTP requests even in INFO mode
- logging.getLogger('azure').setLevel(logging.WARNING)
- # PyJWT prints out warnings that are beyond our control
- warnings.filterwarnings("ignore", category=DeprecationWarning)
- if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
+ if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
+ and not self.lightning_container.azure_dataset_id:
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
"property must be set.")
source_config = SourceConfig(
root_folder=self.project_root,
entry_script=Path(sys.argv[0]).resolve(),
+ script_params=sys.argv[1:],
conda_dependencies_files=get_all_environment_files(self.project_root),
hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
else self.lightning_container.get_hyperdrive_config),
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
upload_timeout_seconds=86400
)
- source_config.set_script_params_except_submit_flag()
# Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
# when running pytest.
ignored_folders = []
if not self.azure_config.pytest_mark:
- ignored_folders.extend(["Tests", "TestsOutsidePackage", "TestSubmodule"])
+ ignored_folders.extend(["Tests", "TestsOutsidePackage"])
if not self.lightning_container.regression_test_folder:
ignored_folders.append("RegressionTestResults")
- with append_to_amlignore(ignored_folders):
- azure_run = submit_to_azureml(self.azure_config, source_config,
- self.lightning_container.all_azure_dataset_ids(),
- self.lightning_container.all_dataset_mountpoints())
- logging.info("Job submission to AzureML done.")
- if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
- # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
- # A build step will pick up that file and publish it to Azure DevOps.
- # If pytest_mark is set, this file must exist.
- logging.info("Downloading pytest result file.")
- download_pytest_result(azure_run)
- else:
- logging.info("No pytest_mark present, hence not downloading the pytest result file.")
- # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
- if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
- raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
- "runs failed.")
- return azure_run
+
+ input_datasets = create_dataset_configs(self.azure_config,
+ all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
+ all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints())
+
+
+ def after_submission_hook(azure_run: Run) -> None:
+ """
+ A function that will be called right after job submission.
+ """
+ # Add an extra tag that depends on the run that was actually submitted. This is used later for filtering
+ # runs in cross validation analysis.
+ recovery_id = create_run_recovery_id(azure_run)
+ azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
+ print("If this run fails, re-start runner.py and supply these additional arguments: "
+ f"--run_recovery_id={recovery_id}")
+ if self.azure_config.tensorboard:
+ print("Starting TensorBoard now because you specified --tensorboard")
+ monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
+ azure_config=self.azure_config)
+ else:
+ print(f"To monitor this run locally using TensorBoard, run the script: "
+ f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
+
+ if self.azure_config.wait_for_completion:
+ # We want the job output to be visible on the console, but the program should not exit if the
+ # job fails because we need to download the pytest result file.
+ azure_run.wait_for_completion(show_output=True, raise_on_error=False)
+ if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
+ # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
+ # A build step will pick up that file and publish it to Azure DevOps.
+ # If pytest_mark is set, this file must exist.
+ logging.info("Downloading pytest result file.")
+ download_pytest_result(azure_run)
+
+ hyperdrive_config = None
+ if self.azure_config.hyperdrive:
+ hyperdrive_config = self.lightning_container.get_hyperdrive_config(ScriptRunConfig(source_directory=""))
+
+ # Create a temporary file for the merged conda file, that will be removed after submission of the job.
+ temp_conda: Optional[Path] = None
+ try:
+ if len(source_config.conda_dependencies_files) > 1:
+ temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
+ # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
+ # be necessary if the innereye package is installed. It is necessary when working with an outer project
+ # and InnerEye as a git submodule and submitting jobs from the local machine.
+ # In case of version conflicts, the package version in the outer project is given priority.
+ merge_conda_files(source_config.conda_dependencies_files, temp_conda)
+
+ # Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and so
+ # we should only attempt them if we intend to elevate this to AzureML
+ if self.azure_config.azureml:
+ if not self.azure_config.cluster:
+ raise ValueError("self.azure_config.cluster not set, but we need a compute_cluster_name to submit"
+ "the script to run in AzureML")
+ azure_run_info = submit_to_azure_if_needed(
+ entry_script=source_config.entry_script,
+ snapshot_root_directory=source_config.root_folder,
+ script_params=source_config.script_params,
+ conda_environment_file=temp_conda or source_config.conda_dependencies_files[0],
+ aml_workspace=self.azure_config.get_workspace(),
+ compute_cluster_name=self.azure_config.cluster,
+ environment_variables=source_config.environment_variables,
+ default_datastore=self.azure_config.azureml_datastore,
+ experiment_name=to_azure_friendly_string(create_experiment_name(self.azure_config)),
+ max_run_duration=self.azure_config.max_run_duration,
+ input_datasets=input_datasets,
+ num_nodes=self.azure_config.num_nodes,
+ wait_for_completion=False,
+ ignored_folders=ignored_folders,
+ pip_extra_index_url=self.azure_config.pip_extra_index_url,
+ submit_to_azureml=self.azure_config.azureml,
+ docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
+ docker_shm_size=self.azure_config.docker_shm_size,
+ tags=additional_run_tags(
+ azure_config=self.azure_config,
+ commandline_args=" ".join(source_config.script_params)),
+ after_submission=after_submission_hook,
+ hyperdrive_config=hyperdrive_config)
+ else:
+ # compute_cluster_name is a required parameter in early versions of the HI-ML package
+ azure_run_info = submit_to_azure_if_needed(
+ input_datasets=input_datasets,
+ submit_to_azureml=False,
+ compute_cluster_name="")
+ finally:
+ if temp_conda:
+ temp_conda.unlink()
+ # submit_to_azure_if_needed calls sys.exit after submitting the job to AzureML. We only reach this point when
+ # running the script locally without submission, or when it is already executing inside AzureML.
+ return azure_run_info
def print_git_tags(self) -> None:
"""
@@ -278,9 +351,11 @@ def print_git_tags(self) -> None:
for key, value in tags_to_print.items():
logging.info(f" {key:20}: {value}")
- def run_in_situ(self) -> None:
+ def run_in_situ(self, azure_run_info: AzureRunInfo) -> None:
"""
Actually run the AzureML job; this method will typically run on an Azure VM.
+ :param azure_run_info: Contains all information about the present run in AzureML, in particular where the
+ datasets are mounted.
"""
# Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
# build itself, but not the tons of debug information that AzureML submissions create.
@@ -305,7 +380,7 @@ def run_in_situ(self) -> None:
# if it detects that it is not in a multi-node environment.
set_environment_variables_for_multi_node()
ml_runner = self.create_ml_runner()
- ml_runner.setup()
+ ml_runner.setup(azure_run_info)
ml_runner.start_logging_to_file()
try:
ml_runner.run()
diff --git a/InnerEye/Scripts/submit_for_inference.py b/InnerEye/Scripts/submit_for_inference.py
index 996502418..46df212b4 100755
--- a/InnerEye/Scripts/submit_for_inference.py
+++ b/InnerEye/Scripts/submit_for_inference.py
@@ -11,15 +11,15 @@
import param
import requests
-from azureml.core import Experiment, Model
+from azureml.core import Model, ScriptRunConfig
-from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
-from InnerEye.Azure.azure_runner import create_run_config
+from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.fixed_paths import DEFAULT_DATA_FOLDER, DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME, \
- DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, RUN_SCORING_SCRIPT, SCORE_SCRIPT, \
- SETTINGS_YAML_FILE, repository_root_directory, PYTHON_ENVIRONMENT_NAME
+ DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, PYTHON_ENVIRONMENT_NAME, \
+ RUN_SCORING_SCRIPT, SCORE_SCRIPT, SETTINGS_YAML_FILE, repository_root_directory
from InnerEye.Common.generic_parsing import GenericConfig
+from health.azure.himl import create_run_configuration, submit_run
class SubmitForInferenceConfig(GenericConfig):
@@ -29,8 +29,9 @@ class SubmitForInferenceConfig(GenericConfig):
experiment_name: str = param.String(default="model_inference",
doc="Name of experiment the run should belong to")
model_id: str = param.String(doc="Id of model, e.g. Prostate:123. Mandatory.")
- image_file: Path = param.ClassSelector(class_=Path, doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
- "or zip of a DICOM series otherwise. Mandatory.")
+ image_file: Path = param.ClassSelector(class_=Path,
+ doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
+ "or zip of a DICOM series otherwise. Mandatory.")
settings: Path = param.ClassSelector(class_=Path,
doc="File containing Azure settings (typically your settings.yml). If not "
"provided, use the default settings file.")
@@ -66,7 +67,8 @@ def validate(self) -> None:
def copy_image_file(image: Path, destination_folder: Path, use_dicom: bool) -> Path:
"""
Copy the source image file into the given folder destination_folder.
- :param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip otherwise.
+ :param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip
+ otherwise.
:param destination_folder: top-level directory to copy image into (as test.nii.gz or test.zip)
:param use_dicom: True to treat as a zip file.
:return: The full path of the image in the destination_folder
@@ -154,43 +156,45 @@ def submit_for_inference(args: SubmitForInferenceConfig, azure_config: AzureConf
# clash.
temp_folder = source_directory_path / "temp_for_scoring"
conda_files = download_files_from_model(model_sas_urls, ENVIRONMENT_YAML_FILE_NAME, dir_path=temp_folder)
- if not conda_files:
- raise ValueError("At least 1 Conda environment definition must exist in the model.")
+ if len(conda_files) != 1:
+ raise ValueError("Exactly 1 Conda environment definition must exist in the model.")
# Retrieve the name of the Python environment that the training run used. This environment should have been
# registered. If no such environment exists, it will be re-create from the Conda files provided.
python_environment_name = model.tags.get(PYTHON_ENVIRONMENT_NAME, "")
+ if not python_environment_name:
+ raise ValueError(f"The model did not contain tag {PYTHON_ENVIRONMENT_NAME} for the AzureML environment to use.")
# Copy the scoring script from the repository. This will start the model download from Azure, and invoke the
# scoring script.
entry_script = source_directory_path / Path(RUN_SCORING_SCRIPT).name
shutil.copyfile(str(repository_root_directory(RUN_SCORING_SCRIPT)),
str(entry_script))
- source_config = SourceConfig(
- root_folder=source_directory_path,
- entry_script=entry_script,
- script_params=["--model-folder", ".",
- "--model-id", model_id,
- SCORE_SCRIPT,
- # The data folder must be relative to the root folder of the AzureML job. test_image_files
- # is then just the file relative to the data_folder
- "--data_folder", image.parent.name,
- "--image_files", image.name,
- "--use_dicom", str(args.use_dicom),
- "--model_id", model_id],
- conda_dependencies_files=conda_files,
+ run_config = create_run_configuration(workspace=azure_config.get_workspace(),
+ compute_cluster_name=azure_config.cluster,
+ aml_environment_name=python_environment_name)
+ script_run_config = ScriptRunConfig(
+ source_directory=str(source_directory_path),
+ script=entry_script.relative_to(source_directory_path),
+ arguments=["--model-folder", ".",
+ "--model-id", model_id,
+ SCORE_SCRIPT,
+ # The data folder must be relative to the root folder of the AzureML
+ # job. image_files is then just the file relative to the data_folder
+ "--data_folder", image.parent.name,
+ "--image_files", image.name,
+ "--use_dicom", str(args.use_dicom),
+ "--model_id", model_id],
+ run_config=run_config
)
- run_config = create_run_config(azure_config, source_config, environment_name=python_environment_name,
- all_azure_dataset_ids=[], all_dataset_mountpoints=[])
- exp = Experiment(workspace=workspace, name=args.experiment_name)
- run = exp.submit(run_config)
- logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
- logging.info(f"Run URL: {run.get_portal_url()}")
+
+ run = submit_run(workspace=workspace,
+ experiment_name=args.experiment_name,
+ script_run_config=script_run_config,
+ wait_for_completion=True)
if not args.keep_upload_folder:
source_directory.cleanup()
logging.info(f"Deleted submission directory {source_directory_path}")
if args.download_folder is None:
return None
- logging.info("Awaiting run completion")
- run.wait_for_completion()
logging.info(f"Run has completed with status {run.get_status()}")
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if args.use_dicom else DEFAULT_RESULT_IMAGE_NAME
download_path = choose_download_path(download_file, args.download_folder)
diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 27b0b0240..3ccf96520 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -20,9 +20,9 @@
import pytest
from azureml._restclient.constants import RunStatus
from azureml.core import Model, Run
+from health.azure.himl import RUN_RECOVERY_FILE
from InnerEye.Azure.azure_config import AzureConfig
-from InnerEye.Azure.azure_runner import RUN_RECOVERY_FILE
from InnerEye.Azure.azure_util import MODEL_ID_KEY_NAME, download_run_output_file, download_run_outputs_by_prefix, \
get_comparison_baseline_paths, \
is_running_on_azure_agent, to_azure_friendly_string
@@ -101,6 +101,7 @@ def get_most_recent_model_id(fallback_run_id_for_local_execution: str = FALLBACK
azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
project_root=fixed_paths.repository_root_directory())
run = azure_config.fetch_run(most_recent_run)
+ assert run.status == "Completed", f"AzureML run {run.id} did not complete successfully."
tags = run.get_tags()
model_id = tags.get(MODEL_ID_KEY_NAME, None)
assert model_id, f"No model_id tag was found on run {most_recent_run}"
@@ -209,40 +210,41 @@ def test_check_dataset_mountpoint(test_output_dirs: OutputFolderForTests) -> Non
@pytest.mark.inference
-@pytest.mark.parametrize("use_dicom", [False, True])
-def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderForTests) -> None:
+def test_submit_for_inference(test_output_dirs: OutputFolderForTests) -> None:
"""
Execute the submit_for_inference script on the model that was recently trained. This starts an AzureML job,
and downloads the segmentation. Then check if the segmentation was actually produced.
-
- :param use_dicom: True to test DICOM in/out, False otherwise.
:param test_output_dirs: Test output directories.
"""
model = get_most_recent_model(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
assert PYTHON_ENVIRONMENT_NAME in model.tags, "Environment name not present in model properties"
- if use_dicom:
- size = (64, 64, 64)
- spacing = (1., 1., 2.5)
- image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
- scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
- zip_random_dicom_series(size, spacing, image_file, scratch_folder)
- else:
- image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
- assert image_file.exists(), f"Image file not found: {image_file}"
- settings_file = fixed_paths.SETTINGS_YAML_FILE
- assert settings_file.exists(), f"Settings file not found: {settings_file}"
- args = ["--image_file", str(image_file),
- "--model_id", model.id,
- "--settings", str(settings_file),
- "--download_folder", str(test_output_dirs.root_dir),
- "--cluster", "training-nc12",
- "--experiment", get_experiment_name_from_environment() or "model_inference",
- "--use_dicom", str(use_dicom)]
- download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
- seg_path = test_output_dirs.root_dir / download_file
- assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
- submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
- assert seg_path.exists(), f"Result file {seg_path} was not created"
+ # Both parts of this test rely on the same model that was trained in a previous run. If these tests are executed
+ # independently (via pytest.mark.parametrize), get_most_recent_model would pick up the AML run that the
+ # previously executed part of this test submitted.
+ for use_dicom in [False, True]:
+ if use_dicom:
+ size = (64, 64, 64)
+ spacing = (1., 1., 2.5)
+ image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
+ scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
+ zip_random_dicom_series(size, spacing, image_file, scratch_folder)
+ else:
+ image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
+ assert image_file.exists(), f"Image file not found: {image_file}"
+ settings_file = fixed_paths.SETTINGS_YAML_FILE
+ assert settings_file.exists(), f"Settings file not found: {settings_file}"
+ args = ["--image_file", str(image_file),
+ "--model_id", model.id,
+ "--settings", str(settings_file),
+ "--download_folder", str(test_output_dirs.root_dir),
+ "--cluster", "training-nc12",
+ "--experiment", get_experiment_name_from_environment() or "model_inference",
+ "--use_dicom", str(use_dicom)]
+ download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
+ seg_path = test_output_dirs.root_dir / download_file
+ assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
+ submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
+ assert seg_path.exists(), f"Result file {seg_path} was not created"
def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
@@ -375,9 +377,7 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
# Check diagnostic messages that show if DDP was set up correctly. This could fail if Lightning
# changes its diagnostic outputs.
assert "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
- assert "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4" in log0_txt
assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
- assert "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4" in log1_txt
@pytest.mark.skip("The recovery job hangs after completing on AML")
@@ -395,8 +395,10 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
"--tag", "recovery_on_2_nodes"
]
script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
- with mock.patch("sys.argv", [script] + args_list):
- main()
+ # Submission of the recovery job will try to exit the process, catch that and check the submitted run.
+ with pytest.raises(SystemExit):
+ with mock.patch("sys.argv", [script] + args_list):
+ main()
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
assert run.status == RunStatus.COMPLETED
files = run.get_file_names()
diff --git a/Tests/Azure/test_azure_config.py b/Tests/Azure/test_azure_config.py
index 810abee04..7fc4e300d 100644
--- a/Tests/Azure/test_azure_config.py
+++ b/Tests/Azure/test_azure_config.py
@@ -5,7 +5,7 @@
import pytest
from InnerEye.Azure.azure_config import AzureConfig
-from InnerEye.Azure.azure_runner import create_dataset_consumptions
+from InnerEye.Azure.azure_runner import create_dataset_configs
from Tests.ML.util import get_default_azure_config
@@ -20,7 +20,7 @@ def test_dataset_consumption1() -> None:
Test that an empty dataset ID will not produce any dataset consumption.
"""
azure_config = get_default_azure_config()
- assert len(create_dataset_consumptions(azure_config, [""], [""])) == 0
+ assert len(create_dataset_configs(azure_config, [""], [""])) == 0
def test_dataset_consumption2() -> None:
@@ -29,7 +29,7 @@ def test_dataset_consumption2() -> None:
"""
azure_config = get_default_azure_config()
with pytest.raises(ValueError) as ex:
- create_dataset_consumptions(azure_config, [""], ["foo"])
+ create_dataset_configs(azure_config, [""], ["foo"])
assert "but a mount point has been provided" in str(ex)
@@ -38,7 +38,7 @@ def test_dataset_consumption3() -> None:
Test that a matching number of mount points is created.
"""
azure_config = get_default_azure_config()
- assert len(create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], [])) == 2
+ assert len(create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], [])) == 2
def test_dataset_consumption4() -> None:
@@ -47,15 +47,5 @@ def test_dataset_consumption4() -> None:
"""
azure_config = get_default_azure_config()
with pytest.raises(ValueError) as ex:
- create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], ["foo"])
+ create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], ["foo"])
assert "must equal the number of Azure dataset IDs" in str(ex)
-
-
-def test_dataset_consumption5() -> None:
- """
- Test error handling for empty dataset IDs.
- """
- azure_config = get_default_azure_config()
- with pytest.raises(ValueError) as ex:
- azure_config.get_or_create_dataset("")
- assert "No dataset ID provided" in str(ex)
diff --git a/Tests/Azure/test_azure_util.py b/Tests/Azure/test_azure_util.py
index 9693a59a1..d80f3f041 100644
--- a/Tests/Azure/test_azure_util.py
+++ b/Tests/Azure/test_azure_util.py
@@ -2,25 +2,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
-import time
from pathlib import Path
-from typing import Iterator, Set
import pytest
from azureml.core import Run
from azureml.core.workspace import Workspace
-from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
-from InnerEye.Azure.azure_runner import create_experiment_name, get_or_create_python_environment
+from InnerEye.Azure.azure_config import AzureConfig
+from InnerEye.Azure.azure_runner import create_experiment_name
from InnerEye.Azure.azure_util import DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, fetch_child_runs, fetch_run, \
- get_cross_validation_split_index, is_cross_validation_child_run, is_run_and_child_runs_completed, \
- merge_conda_dependencies, merge_conda_files, to_azure_friendly_container_path
-from InnerEye.Common.common_util import logging_to_stdout, is_linux
+ get_cross_validation_split_index, is_cross_validation_child_run, \
+ to_azure_friendly_container_path
+from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.fixed_paths import PRIVATE_SETTINGS_FILE, PROJECT_SECRETS_FILE, \
- get_environment_yaml_file, repository_root_directory
-from InnerEye.Common.output_directories import OutputFolderForTests
+ repository_root_directory
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, get_most_recent_run, get_most_recent_run_id
-from Tests.ML.util import get_default_azure_config, get_default_workspace
+from Tests.ML.util import get_default_workspace
+from health.azure.azure_util import is_run_and_child_runs_completed
def test_os_path_to_azure_friendly_container_path() -> None:
@@ -85,66 +83,6 @@ def test_is_cross_validation_child_run_ensemble_run() -> None:
assert all([is_cross_validation_child_run(x) for x in fetch_child_runs(run)])
-@pytest.mark.skipif(is_linux(), reason="Spurious file read/write errors on linux build agents.")
-def test_merge_conda(test_output_dirs: OutputFolderForTests) -> None:
- """
- Tests the logic for merging Conda environment files.
- """
- env1 = """
-channels:
- - defaults
- - pytorch
-dependencies:
- - conda1=1.0
- - conda2=2.0
- - conda_both=3.0
- - pip:
- - azureml-sdk==1.7.0
- - foo==1.0
-"""
- env2 = """
-channels:
- - defaults
-dependencies:
- - conda1=1.1
- - conda_both=3.0
- - pip:
- - azureml-sdk==1.6.0
- - bar==2.0
-"""
- # Spurious test failures on Linux build agents, saying that they can't write the file. Wait a bit.
- time.sleep(0.5)
- file1 = test_output_dirs.root_dir / "env1.yml"
- file1.write_text(env1)
- file2 = test_output_dirs.root_dir / "env2.yml"
- file2.write_text(env2)
- # Spurious test failures on Linux build agents, saying that they can't read the file. Wait a bit.
- time.sleep(0.5)
- files = [file1, file2]
- merged_file = test_output_dirs.root_dir / "merged.yml"
- merge_conda_files(files, merged_file)
- assert merged_file.read_text().splitlines() == """channels:
-- defaults
-- pytorch
-dependencies:
-- conda1=1.0
-- conda1=1.1
-- conda2=2.0
-- conda_both=3.0
-- pip:
- - azureml-sdk==1.6.0
- - azureml-sdk==1.7.0
- - bar==2.0
- - foo==1.0
-""".splitlines()
- conda_dep, _ = merge_conda_dependencies(files)
- # We expect to see the union of channels.
- assert list(conda_dep.conda_channels) == ["defaults", "pytorch"]
- # Package version conflicts are not resolved, both versions are retained.
- assert list(conda_dep.conda_packages) == ["conda1=1.0", "conda1=1.1", "conda2=2.0", "conda_both=3.0"]
- assert list(conda_dep.pip_packages) == ["azureml-sdk==1.6.0", "azureml-sdk==1.7.0", "bar==2.0", "foo==1.0"]
-
-
def test_experiment_name() -> None:
c = AzureConfig()
c.build_branch = "branch"
@@ -196,29 +134,3 @@ def test_amlignore() -> None:
test_variables = repository_root_directory(PROJECT_SECRETS_FILE)
if test_variables.is_file():
assert PROJECT_SECRETS_FILE in ignored, f"{PROJECT_SECRETS_FILE} is not in .amlignore"
-
-
-def test_create_python_env() -> None:
- """
- Checks if environment variables in the SourceConfig are correctly passed through to the Python environment.
- Environment variables in SourceConfig are only used in the internal InnerEye repo.
- :return:
- """
- foo = "foo"
- bar = "bar"
- entry_script = Path("something.py")
- conda_file = get_environment_yaml_file()
- s = SourceConfig(root_folder=Path(""), entry_script=entry_script, conda_dependencies_files=[conda_file],
- environment_variables={foo: bar})
- env = get_or_create_python_environment(source_config=s,
- azure_config=get_default_azure_config(),
- register_environment=False)
- assert foo in env.environment_variables
- assert env.environment_variables[foo] == bar
-
- # Check that some of the basic packages that we expect to always exist are picked up correctly in the Conda env
- def remove_version_number(items: Iterator[str]) -> Set[str]:
- return set(c.split("=")[0] for c in items)
-
- assert "pytorch" in remove_version_number(env.python.conda_dependencies.conda_packages)
- assert "pytorch-lightning" in remove_version_number(env.python.conda_dependencies.pip_packages)
diff --git a/Tests/Azure/test_parsing.py b/Tests/Azure/test_parsing.py
index 7f5562273..00d248fd7 100644
--- a/Tests/Azure/test_parsing.py
+++ b/Tests/Azure/test_parsing.py
@@ -3,13 +3,12 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from enum import Enum
-from pathlib import Path
from typing import Any, Optional
from unittest import mock
import pytest
-from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig, SourceConfig
+from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig
from InnerEye.Azure.azure_runner import create_runner_parser, parse_args_and_add_yaml_variables, \
run_duration_string_to_seconds
from InnerEye.Azure.parser_util import _is_empty_or_empty_string_list
@@ -134,37 +133,6 @@ def test_azureml_submit_constant() -> None:
assert hasattr(azure_config, AZURECONFIG_SUBMIT_TO_AZUREML)
-def test_source_config_set_params() -> None:
- """
- Check that commandline arguments are set correctly when submitting the script to AzureML.
- In particular, the azureml flag should be omitted, irrespective of how the argument is written.
- """
- s = SourceConfig(root_folder=Path(""), entry_script=Path("something.py"), conda_dependencies_files=[])
-
- def assert_has_params(expected_args: str) -> None:
- assert s.script_params is not None
- # Arguments are in the keys of the dictionary only, and should have been added in the right order
- assert " ".join(s.script_params) == expected_args
-
- with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}=True", "--more"]):
- s.set_script_params_except_submit_flag()
- assert_has_params("some --param 1 --more")
- with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "False", "--more"]):
- s.set_script_params_except_submit_flag()
- assert_has_params("some --param 1 --more")
- # Using the new syntax for boolean flags
- with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "--more"]):
- s.set_script_params_except_submit_flag()
- assert_has_params("some --param 1 --more")
- with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}"]):
- s.set_script_params_except_submit_flag()
- assert_has_params("some --param 1")
- # Arguments where azureml is just the prefix should not be removed.
- with mock.patch("sys.argv", ["", "some", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}foo", "False", "--more"]):
- s.set_script_params_except_submit_flag()
- assert_has_params(f"some --{AZURECONFIG_SUBMIT_TO_AZUREML}foo False --more")
-
-
@pytest.mark.parametrize(["s", "expected"],
[
("1s", 1),
diff --git a/Tests/Common/test_commandline_parsing.py b/Tests/Common/test_commandline_parsing.py
index 2cc3fbb42..df6779e92 100644
--- a/Tests/Common/test_commandline_parsing.py
+++ b/Tests/Common/test_commandline_parsing.py
@@ -15,6 +15,7 @@
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.runner import Runner
from Tests.ML.configs.DummyModel import DummyModel
+from health.azure.himl import AzureRunInfo
@pytest.mark.parametrize("is_container", [True, False])
@@ -55,12 +56,19 @@ def test_create_ml_runner_args(is_container: bool,
with mock.patch("sys.argv", [""] + args_list):
with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run):
with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None):
- with mock.patch("InnerEye.ML.run_ml.MLRunner.mount_or_download_dataset", return_value=dataset_folder):
+ with mock.patch("InnerEye.ML.run_ml.MLRunner.download_or_use_existing_dataset",
+ return_value=dataset_folder):
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
runner.parse_and_load_model()
# Only when calling config.create_filesystem we expect to see the correct paths, and this happens
# inside run_in_situ
- runner.run_in_situ()
+ azure_run_info = AzureRunInfo(input_datasets=[None],
+ output_datasets=[None],
+ run=None,
+ is_running_in_azure=False,
+ output_folder=Path.cwd(),
+ logs_folder=Path.cwd())
+ runner.run_in_situ(azure_run_info)
azure_config = runner.azure_config
container_or_legacy_config = runner.lightning_container if is_container else runner.model_config
assert azure_config.model == model_name
diff --git a/Tests/Common/test_util.py b/Tests/Common/test_util.py
index fb4f5c25c..0eda9591e 100644
--- a/Tests/Common/test_util.py
+++ b/Tests/Common/test_util.py
@@ -3,13 +3,15 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import os
+import sys
from pathlib import Path
import pytest
-from InnerEye.Common import common_util, fixed_paths
-from InnerEye.Common.common_util import (append_to_amlignore, change_working_directory, check_is_any_of,
+from InnerEye.Common import common_util
+from InnerEye.Common.common_util import (change_working_directory, check_is_any_of,
is_private_field_name, namespace_to_path, path_to_namespace, print_exception)
+from InnerEye.Common.fixed_paths import add_submodules_to_path, repository_root_directory
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path, tests_root_directory
from InnerEye.Common.output_directories import OutputFolderForTests
@@ -125,19 +127,15 @@ def test_change_dir(test_output_dirs: OutputFolderForTests) -> None:
assert (new_dir / "bar.txt").is_file()
-def test_modify_amlignore() -> None:
- """
- Test that we can change the .AMLignore file and change it back to what it was before.
- """
- folder1 = "Added1"
- folder2 = "Added2"
- added_folders = [folder1, folder2]
- amlignore = fixed_paths.repository_root_directory(".amlignore")
- old_contents = amlignore.read_text()
- for f in added_folders:
- assert f not in old_contents
- with append_to_amlignore(added_folders):
- new_contents = amlignore.read_text()
- for f in added_folders:
- assert f in new_contents
- assert amlignore.read_text() == old_contents
+def test_add_submodules_to_path() -> None:
+ original_sys_path = sys.path
+ try:
+ fastmri_folder = repository_root_directory() / "fastMRI"
+ fastmri_str = str(fastmri_folder)
+ assert fastmri_folder.is_dir()
+ if fastmri_str in sys.path:
+ sys.path.remove(fastmri_str)
+ add_submodules_to_path()
+ assert fastmri_str in sys.path
+ finally:
+ sys.path = original_sys_path
diff --git a/Tests/ML/configs/fastmri_random.py b/Tests/ML/configs/fastmri_random.py
index 39b94a6d3..a0f6d3866 100644
--- a/Tests/ML/configs/fastmri_random.py
+++ b/Tests/ML/configs/fastmri_random.py
@@ -13,18 +13,13 @@
from _pytest.monkeypatch import MonkeyPatch
from pytorch_lightning import LightningDataModule, LightningModule
-from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
from InnerEye.ML.lightning_container import LightningContainer
-
-add_folder_to_sys_path_if_needed("fastMRI")
-
+from fastMRI.tests.create_temp_data import create_temp_data
from fastmri.data import SliceDataset
from fastmri.data.subsample import create_mask_for_mask_type
from fastmri.data.transforms import VarNetDataTransform
from fastmri.pl_modules import FastMriDataModule
-# This import can fail if written as "from tests.create_temp_data, even though fastMRI is already in the path.
-from fastMRI.tests.create_temp_data import create_temp_data
class FastMriRandomData(FastMriDataModule):
diff --git a/Tests/ML/models/test_instantiate_models.py b/Tests/ML/models/test_instantiate_models.py
index 459e254b1..d8e3f6582 100644
--- a/Tests/ML/models/test_instantiate_models.py
+++ b/Tests/ML/models/test_instantiate_models.py
@@ -7,11 +7,9 @@
from unittest import mock
import pytest
-from azureml.core import Run
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
from InnerEye.Common.output_directories import OutputFolderForTests
-from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, generate_and_print_model_summary
from Tests.ML.configs.DummyModel import DummyModel
@@ -28,7 +26,8 @@ def find_models() -> List[str]:
path = namespace_to_path(ModelConfigLoader.get_default_search_module())
folders = [path / "segmentation", path / "classification", path / "regression"]
names = [str(f.stem) for folder in folders for f in folder.glob("*.py") if folder.exists()]
- return [name for name in names if not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
+ return [name for name in names if
+ not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
def test_any_models_found() -> None:
@@ -128,27 +127,6 @@ class MockDatasetConsumption:
name = "dummy"
-@pytest.mark.parametrize("container_name", ["DummyContainerWithAzureDataset",
- "DummyContainerWithoutDataset",
- "DummyContainerWithLocalDataset",
- "DummyContainerWithAzureAndLocalDataset"])
-def test_submit_container_to_azureml(container_name: str) -> None:
- """
- Test if we can get the config loader to load a Lightning container model, and get it through the AzureML
- submission process.
- """
- runner = default_runner()
- mock_run = Run.get_context()
- args = ["", f"--model={container_name}", "--azureml=True", "--model_configs_namespace=Tests.ML.configs"]
- with mock.patch("sys.argv", args):
- with mock.patch("InnerEye.Azure.azure_config.AzureConfig.get_dataset_consumption",
- return_value=MockDatasetConsumption):
- with mock.patch("azureml.core.Experiment.submit", return_value=mock_run):
- loaded_config, actual_run = runner.run()
- assert actual_run == mock_run
- assert isinstance(runner.lightning_container, LightningContainer)
-
-
def test_load_container_with_arguments() -> None:
"""
Test if we can load a container and override a value in it via the commandline. Parameters can only be set at
diff --git a/Tests/ML/models/test_scalar_model.py b/Tests/ML/models/test_scalar_model.py
index 8f3ae5b30..6950e36da 100644
--- a/Tests/ML/models/test_scalar_model.py
+++ b/Tests/ML/models/test_scalar_model.py
@@ -377,7 +377,7 @@ def test_runner_restart(test_output_dirs: OutputFolderForTests) -> None:
model_config.recovery_checkpoint_save_interval = 1
model_config.recovery_checkpoints_save_last_k = -1
runner = MLRunner(model_config=model_config)
- runner.setup(use_mount_or_download_dataset=False)
+ runner.setup()
# Epochs are 0 based for saving
create_model_and_store_checkpoint(model_config,
runner.container.checkpoint_folder / f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
diff --git a/Tests/ML/runners/test_runner.py b/Tests/ML/runners/test_runner.py
index daaa2ae08..8c77222c3 100644
--- a/Tests/ML/runners/test_runner.py
+++ b/Tests/ML/runners/test_runner.py
@@ -396,14 +396,16 @@ def test_cross_validation_for_lighting_container_models_is_supported() -> None:
to azure_runner's submit_to_azureml method.
"""
args_list = ["--model=HelloContainer", "--number_of_cross_validation_splits=5", "--azureml=True"]
+ mock_run = mock.MagicMock()
+ mock_run.id = "foo"
+ mock_run.experiment.name = "bar"
with mock.patch("sys.argv", [""] + args_list):
runner = Runner(project_root=fixed_paths.repository_root_directory(),
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
- with mock.patch("InnerEye.Azure.azure_runner.create_and_submit_experiment",
- return_value=None) as create_and_submit_experiment_patch:
- runner.run()
+ with mock.patch("health.azure.himl.submit_run", return_value=mock_run) as create_and_submit_experiment_patch:
+ with pytest.raises(SystemExit):
+ runner.run()
assert runner.lightning_container.model_name == 'HelloContainer'
assert runner.lightning_container.number_of_cross_validation_splits == 5
- args, _ = create_and_submit_experiment_patch.call_args
- script_run_config = args[1]
- assert isinstance(script_run_config, HyperDriveConfig)
+ script_run_config_arg = create_and_submit_experiment_patch.call_args[1]["script_run_config"]
+ assert isinstance(script_run_config_arg, HyperDriveConfig)
diff --git a/Tests/ML/test_download_upload.py b/Tests/ML/test_download_upload.py
index b5f4b1f0d..c7d6884ce 100644
--- a/Tests/ML/test_download_upload.py
+++ b/Tests/ML/test_download_upload.py
@@ -25,6 +25,7 @@
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
from Tests.ML.util import get_default_azure_config
+from health.azure.himl import AzureRunInfo
logging_to_stdout(logging.DEBUG)
@@ -89,26 +90,27 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
# creation may need access to the dataset.
with pytest.raises(ValueError) as ex:
runner.setup()
- assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
+ assert ex.value.args[0] == "Expecting that a dataset is available here."
runner.project_root = test_output_dirs.root_dir
# Pointing the model to a dataset folder that does not exist should raise an Exception
fake_folder = runner.project_root / "foo"
runner.container.local_dataset = fake_folder
with pytest.raises(FileNotFoundError):
- runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
+ runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
# If the local dataset folder exists, mount_or_download_dataset should not do anything.
fake_folder.mkdir()
- local_dataset = runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
+ local_dataset = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
+ runner.container.local_dataset)
assert local_dataset == fake_folder
# Pointing the model to a dataset in Azure should trigger a download
runner.container.local_dataset = None
runner.container.azure_dataset_id = dataset_name
with logging_section("Starting download"):
- result_path = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
- runner.container.local_dataset)
+ result_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
+ runner.container.local_dataset)
# Download goes into / "datasets" / "test_dataset"
expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
assert result_path == expected_path
@@ -156,17 +158,23 @@ def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
- with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
- runner = MLRunner(config, container=container,
- azure_config=None, project_root=test_output_dirs.root_dir)
- runner.setup()
- return runner.container
+ runner = MLRunner(config, container=container,
+ azure_config=None, project_root=test_output_dirs.root_dir)
+ path_from_aml: List[Optional[Path]] = [None] if is_offline_run else [mount_path]
+ runner.setup(azure_run_info=AzureRunInfo(input_datasets=path_from_aml,
+ output_datasets=[],
+ run=None,
+ is_running_in_azure=False,
+ output_folder=Path(),
+ logs_folder=Path()
+ ))
+ return runner.container
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
[
# A built-in InnerEye model must have either local dataset or azure dataset provided.
- (False, "The model must contain either local_dataset or azure_dataset_id"),
+ (False, "Expecting that a dataset is available here."),
# ... but this is OK for Lightning container models. A Lightning container could simply
# download its data from the web before training.
(True, "")
diff --git a/Tests/ML/test_lightning_containers.py b/Tests/ML/test_lightning_containers.py
index 024d41890..162ab2920 100644
--- a/Tests/ML/test_lightning_containers.py
+++ b/Tests/ML/test_lightning_containers.py
@@ -9,9 +9,9 @@
import pandas as pd
import pytest
-from pytorch_lightning import LightningModule
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
+from pytorch_lightning import LightningModule
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode
@@ -21,9 +21,10 @@
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from Tests.ML.configs.DummyModel import DummyModel
-from Tests.ML.configs.lightning_test_containers import DummyContainerWithAzureDataset, DummyContainerWithHooks, DummyContainerWithModel, \
- DummyContainerWithPlainLightning
+from Tests.ML.configs.lightning_test_containers import (DummyContainerWithAzureDataset, DummyContainerWithHooks,
+ DummyContainerWithModel, DummyContainerWithPlainLightning)
from Tests.ML.util import default_runner
+from health.azure.himl import AzureRunInfo
def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
@@ -36,8 +37,7 @@ def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
args = ["", "--model=DummyContainerWithModel", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
- loaded_config, actual_run = runner.run()
- assert actual_run is None
+ runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithModel)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
@@ -81,8 +81,7 @@ def test_run_container_with_plain_lightning_in_situ(test_output_dirs: OutputFold
args = ["", "--model=DummyContainerWithPlainLightning", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
- loaded_config, actual_run = runner.run()
- assert actual_run is None
+ runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithPlainLightning)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
@@ -143,8 +142,8 @@ def test_run_fastmri_container(test_output_dirs: OutputFolderForTests) -> None:
f"--output_to={test_output_dirs.root_dir}",
"--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
- loaded_config, actual_run = runner.run()
- assert actual_run is None
+ loaded_config, run_info = runner.run()
+ assert isinstance(run_info, AzureRunInfo)
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
assert isinstance(runner.lightning_container, FastMriOnRandomData)
diff --git a/Tests/ML/test_regression_tests.py b/Tests/ML/test_regression_tests.py
index 55db89b2d..96dea3c4c 100644
--- a/Tests/ML/test_regression_tests.py
+++ b/Tests/ML/test_regression_tests.py
@@ -39,7 +39,7 @@ def test_regression_test(test_output_dirs: OutputFolderForTests) -> None:
container.local_dataset = test_output_dirs.root_dir
container.regression_test_folder = Path(str(uuid.uuid4().hex))
runner = MLRunner(container=container)
- runner.setup(use_mount_or_download_dataset=False)
+ runner.setup()
with pytest.raises(ValueError) as ex:
runner.run()
assert "Folder with expected files does not exist" in str(ex)
diff --git a/Tests/ML/util.py b/Tests/ML/util.py
index e0869d7e4..40ebf7a61 100644
--- a/Tests/ML/util.py
+++ b/Tests/ML/util.py
@@ -279,7 +279,7 @@ def model_train_unittest(config: Optional[DeepLearningConfig],
# It will also set random seeds correctly. Later we use so initialized container.
# For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
# because all unit test configs come with their own local dataset already.
- runner.setup(use_mount_or_download_dataset=False)
+ runner.setup()
if checkpoint_handler is None:
azure_config = get_default_azure_config()
checkpoint_handler = CheckpointHandler(azure_config=azure_config,
diff --git a/azure-pipelines/build.yaml b/azure-pipelines/build.yaml
index 482c1cb51..370a32f86 100644
--- a/azure-pipelines/build.yaml
+++ b/azure-pipelines/build.yaml
@@ -33,7 +33,8 @@ steps:
displayName: Install InnerEye (Dev) Package
# First run all tests that require the InnerEye package. All code should be consumed via the InnerEye package,
- # hence don't set PYTHONPATH
+ # hence don't set PYTHONPATH to InnerEye - but do set it to hi-ml if that has been included as a submodule for dev
+ # work on the package
- bash: |
source activate InnerEye
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml -n 2 --dist=loadscope --verbose
@@ -49,7 +50,6 @@ steps:
source activate InnerEye
pytest ./TestsOutsidePackage -n 2 --dist=loadscope --verbose --junitxml=junit/test-outside-package-results.xml
env:
- PYTHONPATH: $(Build.SourcesDirectory)/
APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
failOnStderr: false
condition: succeededOrFailed()
diff --git a/conftest.py b/conftest.py
index 0ce24354c..bed8e7e95 100644
--- a/conftest.py
+++ b/conftest.py
@@ -13,8 +13,12 @@
import pytest
-from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder
+from InnerEye.Common.fixed_paths import add_submodules_to_path
from InnerEye.Common.fixed_paths_for_tests import TEST_OUTPUTS_PATH
+from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder
+
+# This needs to be right at the start of conftest, so that test collection already has access to all submodules
+add_submodules_to_path()
@pytest.fixture(autouse=True, scope='session')
diff --git a/docs/contributing.md b/docs/contributing.md
index e71958a3a..f821cd291 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -13,7 +13,20 @@ This document describes guidelines for contributing to the InnerEye-DeepLearning
- DO NOT mix independent and unrelated changes in one PR.
## Coding Style
-The coding style is enforced via flake8 and mypy
+The coding style is enforced via `flake8` and `mypy`. Before pushing any changes to a PR, run both tools on
+your dev box:
+* `flake8`
+* `python mypy_runner.py`
+
+## Unit testing
+- DO write unit tests for each new function or class that you add.
+- DO extend unit tests for existing functions or classes if you change their core behaviour.
+- DO ensure that your tests are designed so that they can pass on a local dev box, even if they rely on
+specific cloud features (for example, by mocking the cloud interaction; see the sketch below).
+- DO run all unit tests on your dev box before submitting your changes. The test suite is designed to pass in full
+outside of cloud builds as well.
+- DO NOT rely only on the test builds in the cloud. Cloud builds trigger AzureML runs on GPU
+machines that have a higher CO2 footprint than your dev box.
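+
+As an illustration only (this sketch is not taken from the repository), a test can replace cloud objects with mocks,
+so that it passes on a machine without any AzureML credentials. The helper `describe_run` below is hypothetical and
+stands in for any code that reads information from an AzureML `Run`:
+```python
+from typing import Any
+from unittest import mock
+
+
+def describe_run(run: Any) -> str:
+    # Hypothetical production code: reads attributes that a real azureml.core.Run would provide.
+    return f"{run.id}: {run.status}"
+
+
+def test_describe_run_without_azure() -> None:
+    # A MagicMock stands in for azureml.core.Run, so the test needs no workspace or credentials.
+    fake_run = mock.MagicMock()
+    fake_run.id = "HD_1234"
+    fake_run.status = "Completed"
+    assert describe_run(fake_run) == "HD_1234: Completed"
+```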
## Creating Issues
- DO use a descriptive title that identifies the issue or the requested feature.
diff --git a/docs/environment.md b/docs/environment.md
index afce4c595..397518357 100644
--- a/docs/environment.md
+++ b/docs/environment.md
@@ -188,3 +188,36 @@ You can do this, for example, to force an update of the `azureml-sdk` and all it
If you want to take the second route:
1. Use `conda env update -f environment.yml --prune` to refresh if you make changes in environment.yml
1. To update packages use `conda update --all` and `pip-review --local --interactive`
+
+
+## Using the hi-ml package
+
+To work on the `hi-ml` package at the same time as `InnerEye-DeepLearning`, it can help to add `hi-ml` as a git
+submodule, rather than consuming it as a package from pypi. When consumed from pypi, any change to the package
+requires a full new docker image build, which costs 20min per run.
+
+* In the repository root, run `git submodule add https://github.com/microsoft/hi-ml`
+* In PyCharm's project browser, mark the folder `hi-ml/src` as Sources Root
+* Remove the entry for the `hi-ml` package from `environment.yml`
+* Modify the start of `InnerEye/ML/runner.py` to look like this:
+```python
+print(f"Starting InnerEye runner at {sys.argv[0]}")
+innereye_root = Path(__file__).absolute().parent.parent.parent
+if (innereye_root / "InnerEye").is_dir():
+ innereye_root_str = str(innereye_root)
+ if innereye_root_str not in sys.path:
+ print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
+ sys.path.insert(0, innereye_root_str)
+ sys.path.append(str(innereye_root / "hi-ml" / "src"))
+```
+
+Alternatively, you can consume a developer version of `hi-ml` from `test.pypi`:
+* Remove the entry for the `hi-ml` package from `environment.yml`
+* Add a section like this to `environment.yml`, to point pip to `test.pypi` and a specific version of the package:
+```
+ ...
+ - pip:
+ - --extra-index-url https://test.pypi.org/simple/
+ - hi-ml==0.1.0.post236
+ ...
+```
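+
+With either route, you can quickly check which copy of `hi-ml` Python actually picks up. This is only a sanity check,
+not part of the official workflow:
+```python
+# Shows where the hi-ml package is loaded from: the git submodule under hi-ml/src, or the pip-installed copy.
+import health.azure.himl as himl
+
+print(himl.__file__)
+```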
diff --git a/environment.yml b/environment.yml
index 91ffdf710..97b04bd58 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,9 +12,9 @@ dependencies:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
- azure-mgmt-datafactory==1.1.0
- - azureml-mlflow==1.23.0
- - azureml-sdk==1.23.0
- - azureml-tensorboard==1.23.0
+ - azureml-mlflow==1.32.0
+ - azureml-sdk==1.32.0
+ - azureml-tensorboard==1.32.0
- conda-merge==0.1.5
- cryptography==3.3.2
- dataclasses-json==0.5.2
@@ -23,6 +23,7 @@ dependencies:
- gitpython==3.1.7
- gputil==1.4.0
- h5py==2.10.0
+ - hi-ml==0.1.2
- InnerEye-DICOM-RT==1.0.1
- joblib==0.16.0
- jupyter==1.0.0
@@ -30,7 +31,7 @@ dependencies:
- lightning-bolts==0.3.4
- matplotlib==3.3.0
- mlflow==1.17.0
- - mypy==0.812
+ - mypy==0.910
- mypy-extensions==0.4.3
- numba==0.51.2
- numpy==1.19.1
diff --git a/mypy_runner.py b/mypy_runner.py
index e24279477..3ecbdac81 100644
--- a/mypy_runner.py
+++ b/mypy_runner.py
@@ -37,7 +37,7 @@ def run_mypy(files: List[str], mypy_executable_path: str) -> int:
else:
print("Skipping.")
if mypy_args:
- command = [mypy_executable_path, "--config=mypy.ini", *mypy_args]
+ command = [mypy_executable_path, "--install-types", "--non-interactive", "--config=mypy.ini", *mypy_args]
# We pipe stdout and then print it, otherwise lines can appear in the wrong order in builds.
process = subprocess.run(command)
return_code = max(return_code, process.returncode)