|
12 | 12 | from pathlib import Path |
13 | 13 | from typing import Any, Dict, List, Optional |
14 | 14 |
|
| 15 | +from health_azure import DatasetConfig |
| 16 | + |
15 | 17 | from InnerEye.Azure.azure_config import AzureConfig, ParserResult |
16 | 18 | from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME |
17 | 19 | from InnerEye.Azure.secrets_handling import read_all_settings |
18 | 20 | from InnerEye.Common.generic_parsing import GenericConfig |
19 | 21 | from InnerEye.ML.common import ModelExecutionMode |
20 | 22 | from InnerEye.ML.utils.config_loader import ModelConfigLoader |
21 | | -from health_azure import DatasetConfig |
22 | 23 |
|
23 | 24 | SLEEP_TIME_SECONDS = 30 |
24 | 25 |
|
@@ -91,34 +92,56 @@ def create_experiment_name(azure_config: AzureConfig) -> str: |
91 | 92 |
|
def create_dataset_configs(azure_config: AzureConfig,
                           all_azure_dataset_ids: List[str],
                           all_dataset_mountpoints: List[str],
                           all_local_datasets: List[Optional[Path]]) -> List[DatasetConfig]:
    """
    Creates the dataset consumption objects for all Azure datasets of the run. The result contains one
    entry per non-empty name in all_azure_dataset_ids, in the original order.

    Valid argument combinations: N azure datasets together with either 0 or N mount points, and either
    0 or N local dataset paths.

    :param azure_config: Azure-related configuration, supplying the datastore name and the mount/download flag.
    :param all_azure_dataset_ids: The names of all datasets in blob storage that this run will use.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param all_local_datasets: The paths for all local versions of the datasets.
    :return: A list of DatasetConfig objects, in the same order as datasets were provided in
        all_azure_dataset_ids, omitting datasets with an empty name.
    :raises ValueError: If the number of mount points or local datasets is neither 0 nor equal to the
        number of Azure datasets.
    """
    num_azure = len(all_azure_dataset_ids)
    num_mount = len(all_dataset_mountpoints)
    num_local = len(all_local_datasets)
    if num_azure == 0 and num_mount == 0:
        # No datasets in Azure at all: This is possible for runs that for example download their own data
        # from the web. There can be any number of local datasets, but we are not checking that here.
        # In MLRunner.setup, there is a check that leaves local datasets intact if there are no Azure datasets.
        return []
    # With N azure datasets, the mount points and local datasets must each have length exactly N or 0.
    # A length of 0 means "use empty mount point / no local dataset" for every entry.
    mounts_match = num_mount in (0, num_azure)
    locals_match = num_local in (0, num_azure)
    if num_azure == 0 or not mounts_match or not locals_match:
        raise ValueError("Invalid dataset setup. You need to specify N entries in azure_datasets and a matching "
                         "number of local_datasets and dataset_mountpoints")
    result: List[DatasetConfig] = []
    for index in range(num_azure):
        dataset_id = all_azure_dataset_ids[index]
        if not dataset_id:
            # Datasets with an empty name are skipped entirely.
            continue
        mount = all_dataset_mountpoints[index] if num_mount else ""
        local_path = all_local_datasets[index] if num_local else None
        config = DatasetConfig(name=dataset_id,
                               # Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
                               # setting to None works.
                               target_folder=mount or None,
                               local_folder=local_path,
                               use_mounting=azure_config.use_dataset_mount,
                               datastore=azure_config.azureml_datastore)
        if not dataset_id.strip():
            # A whitespace-only dataset name still yields an entry, but with the name blanked out.
            config.name = ""
        result.append(config)
    return result
123 | 146 |
|
124 | 147 |
|
|
0 commit comments