2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -90,6 +90,7 @@ in inference-only runs when using lightning containers.
correctly in the SimCLR module
- ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model
weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs.
- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Fix issue where runs on a VM would download the dataset even when a local dataset is provided.

### Removed

@@ -104,6 +105,7 @@ in inference-only runs when using lightning containers.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Removed cryptography from list of invalid
packages in `test_invalid_python_packages` as it is already present as a dependency in our conda environment.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Removed obsolete `TrainGlaucomaCV` from PR build.
- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Removed all code that downloads datasets; this is now handled entirely by hi-ml.

### Deprecated

63 changes: 43 additions & 20 deletions InnerEye/Azure/azure_runner.py
@@ -12,13 +12,14 @@
from pathlib import Path
from typing import Any, Dict, List, Optional

from health_azure import DatasetConfig

from InnerEye.Azure.azure_config import AzureConfig, ParserResult
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
from InnerEye.Azure.secrets_handling import read_all_settings
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from health_azure import DatasetConfig

SLEEP_TIME_SECONDS = 30

@@ -91,34 +92,56 @@ def create_experiment_name(azure_config: AzureConfig) -> str:

def create_dataset_configs(azure_config: AzureConfig,
all_azure_dataset_ids: List[str],
all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
all_dataset_mountpoints: List[str],
all_local_datasets: List[Optional[Path]]) -> List[DatasetConfig]:
"""
Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
skipped.
Sets up all the dataset consumption objects for the datasets provided. The returned list contains one entry
per non-empty Azure dataset ID.

Valid argument combinations:
N azure datasets, 0 or N mount points, 0 or N local datasets

:param azure_config: Azure-related configuration to use for model scale-out behaviour
:param all_azure_dataset_ids: The names of all datasets on blob storage that will be used for this run.
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
:param all_local_datasets: The paths for all local versions of the datasets.
:return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
omitting datasets with an empty name.
"""
datasets: List[DatasetConfig] = []
if len(all_dataset_mountpoints) > 0:
if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
f"must equal the number of Azure dataset IDs ({len(all_azure_dataset_ids)})")
num_local = len(all_local_datasets)
num_azure = len(all_azure_dataset_ids)
num_mount = len(all_dataset_mountpoints)
if num_azure > 0 and (num_local == 0 or num_local == num_azure) and (num_mount == 0 or num_mount == num_azure):
# Test for valid settings: if we have N Azure datasets, the lists of local datasets and mount points must
# each have either length N or length 0. In the latter case, empty mount points and no local datasets are
# assumed below.
count = num_azure
elif num_azure == 0 and num_mount == 0:
# No datasets in Azure at all: this is possible for runs that, for example, download their own data from the web.
# There can be any number of local datasets, but we are not checking that. In MLRunner.setup, there is a check
# that leaves local datasets intact if there are no Azure datasets.
return []
else:
all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
if dataset_id:
datasets.append(DatasetConfig(name=dataset_id,
# Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
# setting to None works.
target_folder=mount_point or None,
use_mounting=azure_config.use_dataset_mount,
datastore=azure_config.azureml_datastore))
elif mount_point:
raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
f"been provided ('{mount_point}')")
raise ValueError("Invalid dataset setup. You need to specify N entries in azure_datasets and a matching "
"number of local_datasets and dataset_mountpoints")
for i in range(count):
azure_dataset = all_azure_dataset_ids[i] if i < num_azure else ""
if not azure_dataset:
continue
mount_point = all_dataset_mountpoints[i] if i < num_mount else ""
local_dataset = all_local_datasets[i] if i < num_local else None
is_empty_azure_dataset = len(azure_dataset.strip()) == 0
config = DatasetConfig(name=azure_dataset,
# Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
# setting to None works.
target_folder=mount_point or None,
local_folder=local_dataset,
use_mounting=azure_config.use_dataset_mount,
datastore=azure_config.azureml_datastore)
if is_empty_azure_dataset:
config.name = ""
datasets.append(config)
return datasets


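A minimal usage sketch of the new `create_dataset_configs` signature (the dataset names, mount points, and local paths below are hypothetical, and `AzureConfig()` with default settings is used only for illustration):

```python
from pathlib import Path

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_runner import create_dataset_configs

azure_config = AzureConfig()  # an existing AzureConfig would normally be passed in

# Two Azure datasets; mount points and local datasets must be given either for all
# of them or for none of them (length N or length 0).
dataset_configs = create_dataset_configs(
    azure_config=azure_config,
    all_azure_dataset_ids=["dataset_main", "dataset_extra"],
    all_dataset_mountpoints=["datasets/main", "datasets/extra"],
    all_local_datasets=[Path("/data/main"), None])
# Datasets with an empty ID are skipped, so at most two DatasetConfig objects are returned here.
```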
2 changes: 0 additions & 2 deletions InnerEye/Common/fixed_paths.py
@@ -37,8 +37,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
DEFAULT_LOGS_DIR_NAME = "logs"

DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
# The folder at the project root directory that holds datasets for local execution.
DATASETS_DIR_NAME = "datasets"

ML_RELATIVE_SOURCE_PATH = os.path.join("ML")
ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py")
45 changes: 45 additions & 0 deletions InnerEye/Common/generic_parsing.py
@@ -6,6 +6,7 @@

import argparse
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Type, Union

import param
@@ -32,6 +33,50 @@ def _validate(self, val: Any) -> None:
super()._validate(val)


class StringOrStringList(param.Parameter):
"""
Wrapper class that accepts either a string or a list of strings. Internally, the value is always represented as a list.
"""

def _validate(self, val: Any) -> None:
if isinstance(val, str):
return
if isinstance(val, List):
if all([isinstance(v, str) for v in val]):
return
raise ValueError(f"{val} must be a string or a list of strings")

def set_hook(self, obj: Any, val: Any) -> Any:
"""
Modifies the value before calling the setter. Here, we are converting a single string into a list of strings.
"""
if isinstance(val, str):
return [val]
return val


class PathOrPathList(param.Parameter):
"""
Wrapper class that accepts either a Path or a list of Paths. Internally, the value is always represented as a list.
"""

def _validate(self, val: Any) -> None:
if isinstance(val, Path):
return
if isinstance(val, List):
if all([isinstance(v, Path) for v in val]):
return
raise ValueError(f"{val} must be a Path object or a list of paths")

def set_hook(self, obj: Any, val: Any) -> Any:
"""
Modifies the value before calling the setter. Here, we are converting a single Path into a list of Paths.
"""
if isinstance(val, Path):
return [val]
return val


class IntTuple(param.NumericTuple):
"""
Parameter class that must always have integer values
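A sketch of how the new parameter wrappers behave. The class and field names below are hypothetical, and it assumes that `param` invokes `set_hook` before validation when a value is assigned, which is the mechanism these wrappers rely on:

```python
from pathlib import Path

import param

from InnerEye.Common.generic_parsing import PathOrPathList, StringOrStringList


class ExampleParams(param.Parameterized):
    # Both fields accept a scalar or a list; set_hook normalises scalars to one-element lists.
    dataset_ids = StringOrStringList(default=[])
    dataset_folders = PathOrPathList(default=[])


example = ExampleParams()
example.dataset_ids = "dataset_main"          # stored as ["dataset_main"]
example.dataset_folders = Path("/data/main")  # stored as [Path("/data/main")]
example.dataset_ids = ["a", "b"]              # lists of strings are stored unchanged
```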
35 changes: 22 additions & 13 deletions InnerEye/ML/deep_learning_config.py
@@ -18,7 +18,7 @@
from InnerEye.Common.common_util import ModelProcessing, is_windows
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR_NAME
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.Common.type_annotations import PathOrString, TupleFloat2
from InnerEye.Common.type_annotations import PathOrString, T, TupleFloat2
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, create_unique_timestamp_id, \
get_best_checkpoint_path, get_recovery_checkpoint_path

@@ -369,10 +369,10 @@ class DatasetParams(param.Parameterized):
param.List(default=[], allow_None=False,
doc="This can be used to feed in additional datasets to your custom datamodules. These will be"
"mounted and made available as a list of paths in 'extra_local_datasets' when running in AML.")
extra_local_dataset_paths: List[Path] = param.List(class_=Path, default=[], allow_None=False,
doc="This can be used to feed in additional datasets "
"to your custom datamodules when running outside of Azure "
"AML.")
extra_local_dataset_paths: List[Optional[Path]] = \
param.List(class_=Path, default=[], allow_None=False,
doc="This can be used to feed in additional datasets "
"to your custom datamodules when running outside of Azure AML.")
dataset_mountpoint: str = param.String(doc="The path at which the AzureML dataset should be made available via "
"mounting or downloading. This only affects jobs running in AzureML."
"If empty, use a random mount/download point.")
@@ -396,20 +396,29 @@ def all_azure_dataset_ids(self) -> List[str]:
Returns a list with all azure dataset IDs that are specified in self.azure_dataset_id and
self.extra_azure_dataset_ids
"""
if not self.azure_dataset_id:
return self.extra_azure_dataset_ids
else:
return [self.azure_dataset_id] + self.extra_azure_dataset_ids
return self._concat_paths(self.azure_dataset_id, self.extra_azure_dataset_ids)

def all_dataset_mountpoints(self) -> List[str]:
"""
Returns a list with all dataset mount points that are specified in self.dataset_mountpoint and
self.extra_dataset_mountpoints
"""
if not self.dataset_mountpoint:
return self.extra_dataset_mountpoints
else:
return [self.dataset_mountpoint] + self.extra_dataset_mountpoints
return self._concat_paths(self.dataset_mountpoint, self.extra_dataset_mountpoints)

def all_local_dataset_paths(self) -> List[Path]:
"""
Returns a list with all local dataset paths that are specified in self.local_dataset and
self.extra_local_dataset_paths
"""
return self._concat_paths(self.local_dataset, self.extra_local_dataset_paths) # type: ignore

def _concat_paths(self, item: Optional[T], items: List[T]) -> List[T]:
"""
Creates a list with the given item going first (unless it is None or an empty/whitespace-only string), followed by the other items.
"""
if item is None or (isinstance(item, str) and not item.strip()):
return items
return [item] + items


class OutputParams(param.Parameterized):
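An illustrative sketch of how the primary dataset settings and the `extra_*` lists are combined by these helpers (all values below are hypothetical; all three methods delegate to `_concat_paths`):

```python
from pathlib import Path

from InnerEye.ML.deep_learning_config import DatasetParams

params = DatasetParams(azure_dataset_id="dataset_main",
                       extra_azure_dataset_ids=["dataset_extra_1", "dataset_extra_2"],
                       dataset_mountpoint="",  # empty, hence only the extra mount points are returned
                       extra_dataset_mountpoints=["mounts/extra_1", "mounts/extra_2"],
                       local_dataset=Path("/data/main"),
                       extra_local_dataset_paths=[Path("/data/extra_1")])
print(params.all_azure_dataset_ids())    # ['dataset_main', 'dataset_extra_1', 'dataset_extra_2']
print(params.all_dataset_mountpoints())  # ['mounts/extra_1', 'mounts/extra_2']
print(params.all_local_dataset_paths())  # [PosixPath('/data/main'), PosixPath('/data/extra_1')]
```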
1 change: 1 addition & 0 deletions InnerEye/ML/model_training.py
@@ -91,6 +91,7 @@ def create_lightning_trainer(container: LightningContainer,
:param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
:return: A tuple [Trainer object, diagnostic logger]
"""
logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
num_gpus = container.num_gpus_per_node()
effective_num_gpus = num_gpus * num_nodes
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
8 changes: 5 additions & 3 deletions InnerEye/ML/normalize_and_visualize_dataset.py
@@ -20,9 +20,9 @@
from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
from InnerEye.ML.deep_learning_config import ARGS_TXT
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import load_images_from_dataset_source
from health_azure import DatasetConfig


class NormalizeAndVisualizeConfig(GenericConfig):
@@ -73,8 +73,10 @@ def main(yaml_file_path: Path) -> None:
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
"""
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
runner = MLRunner(config, azure_config=runner_config)
local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
dataset_config = DatasetConfig(name=config.azure_dataset_id,
local_folder=config.local_dataset,
use_mounting=True)
local_dataset, mount_context = dataset_config.to_input_dataset_local(workspace=runner_config.get_workspace())
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
normalizer_config = NormalizeAndVisualizeConfig(**args)
actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
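For context, a hedged sketch of the hi-ml behaviour this change relies on. The call mirrors the one above; the return behaviour (preferring a provided local dataset over a download) is assumed from the CHANGELOG entry for this PR, and `workspace` stands for an AzureML Workspace such as `runner_config.get_workspace()`:

```python
from pathlib import Path

from health_azure import DatasetConfig

# Assumed behaviour: when local_folder points to an existing copy of the dataset, hi-ml
# returns that path directly and skips the download/mount; otherwise the dataset is
# mounted or downloaded from the AzureML workspace.
dataset_config = DatasetConfig(name="my_azure_dataset",
                               local_folder=Path("/data/my_local_copy"),
                               use_mounting=True)
local_path, mount_context = dataset_config.to_input_dataset_local(workspace=workspace)
print(f"Dataset files are read from {local_path}")
```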