This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit cd0c685

Bug fix: When using local folders, datasets are downloaded nevertheless (#604)
1 parent 5a37198 commit cd0c685

15 files changed: +274 −368 lines

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -90,6 +90,7 @@ in inference-only runs when using lightning containers.
   correctly in the SimCLR module
 - ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model
   weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs.
+- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Fix issue where runs on a VM would download the dataset even when a local dataset is provided.

 ### Removed

@@ -104,6 +105,7 @@ in inference-only runs when using lightning containers.
 - ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Removed cryptography from list of invalid
   packages in `test_invalid_python_packages` as it is already present as a dependency in our conda environment.
 - ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Removed obsolete `TrainGlaucomaCV` from PR build.
+- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Removed all code that downloads datasets; this is now handled by hi-ml.

 ### Deprecated
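
In practice, the fix passes the local folder through to hi-ml's DatasetConfig, so that dataset resolution outside AzureML short-circuits to the local copy instead of downloading from blob storage. A minimal sketch of the intended behavior, with a hypothetical dataset name and path:

from pathlib import Path

from health_azure import DatasetConfig

# Hypothetical dataset name and local path, for illustration only.
dataset = DatasetConfig(name="my_azure_dataset",
                        local_folder=Path("/data/my_dataset"))
# With local_folder set to an existing directory, resolving this dataset outside
# AzureML should yield /data/my_dataset directly, with no download.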

InnerEye/Azure/azure_runner.py

Lines changed: 43 additions & 20 deletions
@@ -12,13 +12,14 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional

+from health_azure import DatasetConfig
+
 from InnerEye.Azure.azure_config import AzureConfig, ParserResult
 from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
 from InnerEye.Azure.secrets_handling import read_all_settings
 from InnerEye.Common.generic_parsing import GenericConfig
 from InnerEye.ML.common import ModelExecutionMode
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
-from health_azure import DatasetConfig

 SLEEP_TIME_SECONDS = 30

@@ -91,34 +92,56 @@ def create_experiment_name(azure_config: AzureConfig) -> str:

 def create_dataset_configs(azure_config: AzureConfig,
                            all_azure_dataset_ids: List[str],
-                           all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
+                           all_dataset_mountpoints: List[str],
+                           all_local_datasets: List[Optional[Path]]) -> List[DatasetConfig]:
     """
-    Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
-    skipped.
+    Sets up all the dataset consumption objects for the datasets provided. The returned list will have the same length
+    as there are non-empty azure dataset IDs.
+
+    Valid argument combinations:
+    N azure datasets, 0 or N mount points, 0 or N local datasets
+
     :param azure_config: azure related configurations to use for model scale-out behaviour
     :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
     :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
+    :param all_local_datasets: The paths for all local versions of the datasets.
     :return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
         omitting datasets with an empty name.
     """
     datasets: List[DatasetConfig] = []
-    if len(all_dataset_mountpoints) > 0:
-        if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
-            raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
-                             f"must equal the number of Azure dataset IDs ({len(all_azure_dataset_ids)})")
+    num_local = len(all_local_datasets)
+    num_azure = len(all_azure_dataset_ids)
+    num_mount = len(all_dataset_mountpoints)
+    if num_azure > 0 and (num_local == 0 or num_local == num_azure) and (num_mount == 0 or num_mount == num_azure):
+        # Test for valid settings: If we have N azure datasets, the local datasets and mount points need to either
+        # have exactly the same length, or 0. In the latter case, empty mount points and no local dataset will be
+        # assumed below.
+        count = num_azure
+    elif num_azure == 0 and num_mount == 0:
+        # No datasets in Azure at all: This is possible for runs that for example download their own data from the web.
+        # There can be any number of local datasets, but we are not checking that. In MLRunner.setup, there is a check
+        # that leaves local datasets intact if there are no Azure datasets.
+        return []
     else:
-        all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
-    for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
-        if dataset_id:
-            datasets.append(DatasetConfig(name=dataset_id,
-                                          # Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
-                                          # setting to None works.
-                                          target_folder=mount_point or None,
-                                          use_mounting=azure_config.use_dataset_mount,
-                                          datastore=azure_config.azureml_datastore))
-        elif mount_point:
-            raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
-                             f"been provided ('{mount_point}')")
+        raise ValueError("Invalid dataset setup. You need to specify N entries in azure_datasets and a matching "
+                         "number of local_datasets and dataset_mountpoints")
+    for i in range(count):
+        azure_dataset = all_azure_dataset_ids[i] if i < num_azure else ""
+        if not azure_dataset:
+            continue
+        mount_point = all_dataset_mountpoints[i] if i < num_mount else ""
+        local_dataset = all_local_datasets[i] if i < num_local else None
+        is_empty_azure_dataset = len(azure_dataset.strip()) == 0
+        config = DatasetConfig(name=azure_dataset,
+                               # Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
+                               # setting to None works.
+                               target_folder=mount_point or None,
+                               local_folder=local_dataset,
+                               use_mounting=azure_config.use_dataset_mount,
+                               datastore=azure_config.azureml_datastore)
+        if is_empty_azure_dataset:
+            config.name = ""
+        datasets.append(config)
     return datasets
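
To make the accepted combinations concrete, here is a hedged usage sketch of create_dataset_configs. The AzureConfig instance relies on its defaults, and all dataset names and paths are hypothetical:

from pathlib import Path

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_runner import create_dataset_configs

azure_config = AzureConfig()  # default settings, assumed sufficient for illustration

# Two Azure datasets with two matching local copies and no explicit mount points:
configs = create_dataset_configs(azure_config,
                                 all_azure_dataset_ids=["dataset_a", "dataset_b"],
                                 all_dataset_mountpoints=[],
                                 all_local_datasets=[Path("/data/a"), Path("/data/b")])
assert len(configs) == 2

# No Azure datasets and no mount points: nothing needs consuming, so the result is empty.
assert create_dataset_configs(azure_config, [], [], [Path("/data/a")]) == []

# Mismatched lengths (e.g. 2 Azure datasets but 1 mount point) raise a ValueError.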

InnerEye/Common/fixed_paths.py

Lines changed: 0 additions & 2 deletions
@@ -37,8 +37,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
 DEFAULT_LOGS_DIR_NAME = "logs"

 DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
-# The folder at the project root directory that holds datasets for local execution.
-DATASETS_DIR_NAME = "datasets"

 ML_RELATIVE_SOURCE_PATH = os.path.join("ML")
 ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py")

InnerEye/Common/generic_parsing.py

Lines changed: 45 additions & 0 deletions
@@ -6,6 +6,7 @@

 import argparse
 import logging
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Set, Type, Union

 import param
@@ -32,6 +33,50 @@ def _validate(self, val: Any) -> None:
         super()._validate(val)


+class StringOrStringList(param.Parameter):
+    """
+    Wrapper class to allow either a string or a list of strings. Internally represented always as a list.
+    """
+
+    def _validate(self, val: Any) -> None:
+        if isinstance(val, str):
+            return
+        if isinstance(val, List):
+            if all([isinstance(v, str) for v in val]):
+                return
+        raise ValueError(f"{val} must be a string or a list of strings")
+
+    def set_hook(self, obj: Any, val: Any) -> Any:
+        """
+        Modifies the value before calling the setter. Here, we are converting all strings to lists of strings.
+        """
+        if isinstance(val, str):
+            return [val]
+        return val
+
+
+class PathOrPathList(param.Parameter):
+    """
+    Wrapper class to allow either a Path or a list of Paths. Internally represented always as a list.
+    """
+
+    def _validate(self, val: Any) -> None:
+        if isinstance(val, Path):
+            return
+        if isinstance(val, List):
+            if all([isinstance(v, Path) for v in val]):
+                return
+        raise ValueError(f"{val} must be a Path object or a list of paths")
+
+    def set_hook(self, obj: Any, val: Any) -> Any:
+        """
+        Modifies the value before calling the setter. Here, we are converting a single Path to a list of paths.
+        """
+        if isinstance(val, Path):
+            return [val]
+        return val
+
+
 class IntTuple(param.NumericTuple):
     """
     Parameter class that must always have integer values
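
A sketch of how the two new wrapper classes behave as fields of a param.Parameterized class, assuming (as the implementations above rely on) that param invokes set_hook on assignment. The class and field names are hypothetical:

from pathlib import Path

import param

from InnerEye.Common.generic_parsing import PathOrPathList, StringOrStringList


class ExampleParams(param.Parameterized):
    # Hypothetical fields: either a scalar or a list may be assigned.
    names = StringOrStringList(default=[])
    paths = PathOrPathList(default=[])


example = ExampleParams()
example.names = "single"           # set_hook stores this as ["single"]
example.paths = Path("/tmp/data")  # set_hook stores this as [Path("/tmp/data")]
assert example.names == ["single"]
# example.names = 42  # would raise ValueError: neither a string nor a list of strings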

InnerEye/ML/deep_learning_config.py

Lines changed: 22 additions & 13 deletions
@@ -18,7 +18,7 @@
 from InnerEye.Common.common_util import ModelProcessing, is_windows
 from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR_NAME
 from InnerEye.Common.generic_parsing import GenericConfig
-from InnerEye.Common.type_annotations import PathOrString, TupleFloat2
+from InnerEye.Common.type_annotations import PathOrString, T, TupleFloat2
 from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, create_unique_timestamp_id, \
     get_best_checkpoint_path, get_recovery_checkpoint_path

@@ -369,10 +369,10 @@ class DatasetParams(param.Parameterized):
         param.List(default=[], allow_None=False,
                    doc="This can be used to feed in additional datasets to your custom datamodules. These will be"
                        "mounted and made available as a list of paths in 'extra_local_datasets' when running in AML.")
-    extra_local_dataset_paths: List[Path] = param.List(class_=Path, default=[], allow_None=False,
-                                                       doc="This can be used to feed in additional datasets "
-                                                           "to your custom datamodules when running outside of Azure "
-                                                           "AML.")
+    extra_local_dataset_paths: List[Optional[Path]] = \
+        param.List(class_=Path, default=[], allow_None=False,
+                   doc="This can be used to feed in additional datasets "
+                       "to your custom datamodules when running outside of Azure AML.")
     dataset_mountpoint: str = param.String(doc="The path at which the AzureML dataset should be made available via "
                                                "mounting or downloading. This only affects jobs running in AzureML."
                                                "If empty, use a random mount/download point.")
@@ -396,20 +396,29 @@ def all_azure_dataset_ids(self) -> List[str]:
         Returns a list with all azure dataset IDs that are specified in self.azure_dataset_id and
         self.extra_azure_dataset_ids
         """
-        if not self.azure_dataset_id:
-            return self.extra_azure_dataset_ids
-        else:
-            return [self.azure_dataset_id] + self.extra_azure_dataset_ids
+        return self._concat_paths(self.azure_dataset_id, self.extra_azure_dataset_ids)

     def all_dataset_mountpoints(self) -> List[str]:
         """
         Returns a list with all dataset mount points that are specified in self.dataset_mountpoint and
         self.extra_dataset_mountpoints
         """
-        if not self.dataset_mountpoint:
-            return self.extra_dataset_mountpoints
-        else:
-            return [self.dataset_mountpoint] + self.extra_dataset_mountpoints
+        return self._concat_paths(self.dataset_mountpoint, self.extra_dataset_mountpoints)
+
+    def all_local_dataset_paths(self) -> List[Path]:
+        """
+        Returns a list with all local dataset paths that are specified in self.local_dataset and
+        self.extra_local_dataset_paths
+        """
+        return self._concat_paths(self.local_dataset, self.extra_local_dataset_paths)  # type: ignore
+
+    def _concat_paths(self, item: Optional[T], items: List[T]) -> List[T]:
+        """
+        Creates a list with the item going first (if it does not evaluate to False), then the rest of the items.
+        """
+        if item is None or (isinstance(item, str) and not item.strip()):
+            return items
+        return [item] + items


 class OutputParams(param.Parameterized):
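
The net effect of _concat_paths is a uniform "single value first, then the extras" flattening across all three dataset fields, with None and blank strings dropped. A small sketch with hypothetical values, assuming DatasetParams can be instantiated standalone:

from pathlib import Path

from InnerEye.ML.deep_learning_config import DatasetParams

params = DatasetParams(azure_dataset_id="main_dataset",
                       extra_azure_dataset_ids=["extra_1", "extra_2"],
                       local_dataset=Path("/data/main"),
                       extra_local_dataset_paths=[Path("/data/extra_1")])
assert params.all_azure_dataset_ids() == ["main_dataset", "extra_1", "extra_2"]
assert params.all_local_dataset_paths() == [Path("/data/main"), Path("/data/extra_1")]

# A blank single-valued field contributes nothing: only the extras remain.
assert DatasetParams(azure_dataset_id=" ").all_azure_dataset_ids() == []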

InnerEye/ML/model_training.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ def create_lightning_trainer(container: LightningContainer,
     :param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
     :return: A tuple [Trainer object, diagnostic logger]
     """
+    logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
     num_gpus = container.num_gpus_per_node()
     effective_num_gpus = num_gpus * num_nodes
     # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).

InnerEye/ML/normalize_and_visualize_dataset.py

Lines changed: 5 additions & 3 deletions
@@ -20,9 +20,9 @@
 from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
 from InnerEye.ML.deep_learning_config import ARGS_TXT
 from InnerEye.ML.photometric_normalization import PhotometricNormalization
-from InnerEye.ML.run_ml import MLRunner
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
 from InnerEye.ML.utils.io_util import load_images_from_dataset_source
+from health_azure import DatasetConfig


 class NormalizeAndVisualizeConfig(GenericConfig):
@@ -73,8 +73,10 @@ def main(yaml_file_path: Path) -> None:
     In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
     """
     config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
-    runner = MLRunner(config, azure_config=runner_config)
-    local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
+    dataset_config = DatasetConfig(name=config.azure_dataset_id,
+                                   local_folder=config.local_dataset,
+                                   use_mounting=True)
+    local_dataset, mount_context = dataset_config.to_input_dataset_local(workspace=runner_config.get_workspace())
     dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
     normalizer_config = NormalizeAndVisualizeConfig(**args)
     actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
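
The same pattern applies anywhere the removed MLRunner.download_or_use_existing_dataset was used: build a DatasetConfig and let hi-ml resolve it. A hedged sketch of that resolution step, using the to_input_dataset_local call from the diff above; the helper name and the loosely typed mount context are assumptions:

from pathlib import Path
from typing import Any, Optional, Tuple

from azureml.core import Workspace
from health_azure import DatasetConfig


def resolve_dataset(azure_dataset_id: str,
                    local_dataset: Optional[Path],
                    workspace: Workspace) -> Tuple[Path, Optional[Any]]:
    """Prefer an existing local folder; otherwise mount/download via hi-ml."""
    dataset_config = DatasetConfig(name=azure_dataset_id,
                                   local_folder=local_dataset,
                                   use_mounting=True)
    # Returns the resolved path plus a mount context (None when the local
    # folder was used and nothing had to be mounted).
    return dataset_config.to_input_dataset_local(workspace=workspace)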
