2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -90,6 +90,7 @@ in inference-only runs when using lightning containers.
correctly in the SimCLR module
- ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model
weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs.
- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Fix issue where runs on a VM would download the dataset even when a local dataset is provided.

### Removed

@@ -104,6 +105,7 @@ in inference-only runs when using lightning containers.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Removed cryptography from list of invalid
packages in `test_invalid_python_packages` as it is already present as a dependency in our conda environment.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Removed obsolete `TrainGlaucomaCV` from PR build.
- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Removed all code that downloads datasets; this is now handled entirely by hi-ml.

### Deprecated

63 changes: 43 additions & 20 deletions InnerEye/Azure/azure_runner.py
@@ -12,13 +12,14 @@
from pathlib import Path
from typing import Any, Dict, List, Optional

from health_azure import DatasetConfig

from InnerEye.Azure.azure_config import AzureConfig, ParserResult
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
from InnerEye.Azure.secrets_handling import read_all_settings
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from health_azure import DatasetConfig

SLEEP_TIME_SECONDS = 30

@@ -91,34 +92,56 @@ def create_experiment_name(azure_config: AzureConfig) -> str:

def create_dataset_configs(azure_config: AzureConfig,
all_azure_dataset_ids: List[str],
all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
all_dataset_mountpoints: List[str],
all_local_datasets: List[Optional[Path]]) -> List[DatasetConfig]:
"""
Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
skipped.
Sets up all the dataset consumption objects for the datasets provided. The returned list contains one entry
per non-empty Azure dataset ID.

Valid argument combinations:
N azure datasets, 0 or N mount points, 0 or N local datasets

:param azure_config: Azure-related configuration to use for model scale-out behaviour
:param all_azure_dataset_ids: The names of all datasets on blob storage that will be used for this run.
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
:param all_local_datasets: The paths for all local versions of the datasets.
:return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
omitting datasets with an empty name.
"""
datasets: List[DatasetConfig] = []
if len(all_dataset_mountpoints) > 0:
if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
f"must equal the number of Azure dataset IDs ({len(all_azure_dataset_ids)})")
num_local = len(all_local_datasets)
num_azure = len(all_azure_dataset_ids)
num_mount = len(all_dataset_mountpoints)
if num_azure > 0 and (num_local == 0 or num_local == num_azure) and (num_mount == 0 or num_mount == num_azure):
# Test for valid settings: if we have N Azure datasets, the lists of local datasets and mount points must
# each have either length N or length 0. In the latter case, empty mount points and no local datasets are
# assumed below.
count = num_azure
elif num_azure == 0 and num_mount == 0:
# No datasets in Azure at all: this is possible for runs that, for example, download their own data from the web.
# There can be any number of local datasets, but we are not checking that. In MLRunner.setup, there is a check
# that leaves local datasets intact if there are no Azure datasets.
return []
else:
all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
if dataset_id:
datasets.append(DatasetConfig(name=dataset_id,
# Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
# setting to None works.
target_folder=mount_point or None,
use_mounting=azure_config.use_dataset_mount,
datastore=azure_config.azureml_datastore))
elif mount_point:
raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
f"been provided ('{mount_point}')")
raise ValueError("Invalid dataset setup. You need to specify N entries in azure_datasets and a matching "
"number of local_datasets and dataset_mountpoints")
for i in range(count):
azure_dataset = all_azure_dataset_ids[i] if i < num_azure else ""
if not azure_dataset:
continue
mount_point = all_dataset_mountpoints[i] if i < num_mount else ""
local_dataset = all_local_datasets[i] if i < num_local else None
is_empty_azure_dataset = len(azure_dataset.strip()) == 0
config = DatasetConfig(name=azure_dataset,
# Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs,
# setting to None works.
target_folder=mount_point or None,
local_folder=local_dataset,
use_mounting=azure_config.use_dataset_mount,
datastore=azure_config.azureml_datastore)
if is_empty_azure_dataset:
config.name = ""
datasets.append(config)
return datasets


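A minimal usage sketch of the new `create_dataset_configs` signature (the dataset names, mount points, and local paths below are hypothetical, and `AzureConfig()` with default settings is used only for illustration):

```python
from pathlib import Path

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_runner import create_dataset_configs

azure_config = AzureConfig()  # an existing AzureConfig would normally be passed in

# Two Azure datasets; mount points and local datasets must be given either for all
# of them or for none of them (length N or length 0).
dataset_configs = create_dataset_configs(
    azure_config=azure_config,
    all_azure_dataset_ids=["dataset_main", "dataset_extra"],
    all_dataset_mountpoints=["datasets/main", "datasets/extra"],
    all_local_datasets=[Path("/data/main"), None])
# Datasets with an empty ID are skipped, so at most two DatasetConfig objects are returned here.
```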
2 changes: 0 additions & 2 deletions InnerEye/Common/fixed_paths.py
@@ -37,8 +37,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
DEFAULT_LOGS_DIR_NAME = "logs"

DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
# The folder at the project root directory that holds datasets for local execution.
DATASETS_DIR_NAME = "datasets"

ML_RELATIVE_SOURCE_PATH = os.path.join("ML")
ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py")
45 changes: 45 additions & 0 deletions InnerEye/Common/generic_parsing.py
@@ -6,6 +6,7 @@

import argparse
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Type, Union

import param
@@ -32,6 +33,50 @@ def _validate(self, val: Any) -> None:
super()._validate(val)


class StringOrStringList(param.Parameter):
"""
Wrapper class that accepts either a string or a list of strings. Internally, the value is always represented as a list.
"""

def _validate(self, val: Any) -> None:
if isinstance(val, str):
return
if isinstance(val, List):
if all([isinstance(v, str) for v in val]):
return
raise ValueError(f"{val} must be a string or a list of strings")

def set_hook(self, obj: Any, val: Any) -> Any:
"""
Modifies the value before calling the setter. Here, we are converting a single string into a list of strings.
"""
if isinstance(val, str):
return [val]
return val


class PathOrPathList(param.Parameter):
"""
Wrapper class that accepts either a Path or a list of Paths. Internally, the value is always represented as a list.
"""

def _validate(self, val: Any) -> None:
if isinstance(val, Path):
return
if isinstance(val, List):
if all([isinstance(v, Path) for v in val]):
return
raise ValueError(f"{val} must be a Path object or a list of paths")

def set_hook(self, obj: Any, val: Any) -> Any:
"""
Modifies the value before calling the setter. Here, we are converting a single Path into a list of Paths.
"""
if isinstance(val, Path):
return [val]
return val


class IntTuple(param.NumericTuple):
"""
Parameter class that must always have integer values
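A sketch of how the new parameter wrappers behave. The class and field names below are hypothetical, and it assumes that `param` invokes `set_hook` before validation when a value is assigned, which is the mechanism these wrappers rely on:

```python
from pathlib import Path

import param

from InnerEye.Common.generic_parsing import PathOrPathList, StringOrStringList


class ExampleParams(param.Parameterized):
    # Both fields accept a scalar or a list; set_hook normalises scalars to one-element lists.
    dataset_ids = StringOrStringList(default=[])
    dataset_folders = PathOrPathList(default=[])


example = ExampleParams()
example.dataset_ids = "dataset_main"          # stored as ["dataset_main"]
example.dataset_folders = Path("/data/main")  # stored as [Path("/data/main")]
example.dataset_ids = ["a", "b"]              # lists of strings are stored unchanged
```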
35 changes: 22 additions & 13 deletions InnerEye/ML/deep_learning_config.py
@@ -18,7 +18,7 @@
from InnerEye.Common.common_util import ModelProcessing, is_windows
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR_NAME
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.Common.type_annotations import PathOrString, TupleFloat2
from InnerEye.Common.type_annotations import PathOrString, T, TupleFloat2
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, create_unique_timestamp_id, \
get_best_checkpoint_path, get_recovery_checkpoint_path

@@ -369,10 +369,10 @@ class DatasetParams(param.Parameterized):
param.List(default=[], allow_None=False,
doc="This can be used to feed in additional datasets to your custom datamodules. These will be"
"mounted and made available as a list of paths in 'extra_local_datasets' when running in AML.")
extra_local_dataset_paths: List[Path] = param.List(class_=Path, default=[], allow_None=False,
doc="This can be used to feed in additional datasets "
"to your custom datamodules when running outside of Azure "
"AML.")
extra_local_dataset_paths: List[Optional[Path]] = \
param.List(class_=Path, default=[], allow_None=False,
doc="This can be used to feed in additional datasets "
"to your custom datamodules when running outside of Azure AML.")
dataset_mountpoint: str = param.String(doc="The path at which the AzureML dataset should be made available via "
"mounting or downloading. This only affects jobs running in AzureML."
"If empty, use a random mount/download point.")
@@ -396,20 +396,29 @@ def all_azure_dataset_ids(self) -> List[str]:
Returns a list with all azure dataset IDs that are specified in self.azure_dataset_id and
self.extra_azure_dataset_ids
"""
if not self.azure_dataset_id:
return self.extra_azure_dataset_ids
else:
return [self.azure_dataset_id] + self.extra_azure_dataset_ids
return self._concat_paths(self.azure_dataset_id, self.extra_azure_dataset_ids)

def all_dataset_mountpoints(self) -> List[str]:
"""
Returns a list with all dataset mount points that are specified in self.dataset_mountpoint and
self.extra_dataset_mountpoints
"""
if not self.dataset_mountpoint:
return self.extra_dataset_mountpoints
else:
return [self.dataset_mountpoint] + self.extra_dataset_mountpoints
return self._concat_paths(self.dataset_mountpoint, self.extra_dataset_mountpoints)

def all_local_dataset_paths(self) -> List[Path]:
"""
Returns a list with all local dataset paths that are specified in self.local_dataset and
self.extra_local_dataset_paths
"""
return self._concat_paths(self.local_dataset, self.extra_local_dataset_paths) # type: ignore

def _concat_paths(self, item: Optional[T], items: List[T]) -> List[T]:
"""
Creates a list with the given item going first (unless it is None or an empty/whitespace-only string), followed by the other items.
"""
if item is None or (isinstance(item, str) and not item.strip()):
return items
return [item] + items


class OutputParams(param.Parameterized):
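An illustrative sketch of how the primary dataset settings and the `extra_*` lists are combined by these helpers (all values below are hypothetical; all three methods delegate to `_concat_paths`):

```python
from pathlib import Path

from InnerEye.ML.deep_learning_config import DatasetParams

params = DatasetParams(azure_dataset_id="dataset_main",
                       extra_azure_dataset_ids=["dataset_extra_1", "dataset_extra_2"],
                       dataset_mountpoint="",  # empty, hence only the extra mount points are returned
                       extra_dataset_mountpoints=["mounts/extra_1", "mounts/extra_2"],
                       local_dataset=Path("/data/main"),
                       extra_local_dataset_paths=[Path("/data/extra_1")])
print(params.all_azure_dataset_ids())    # ['dataset_main', 'dataset_extra_1', 'dataset_extra_2']
print(params.all_dataset_mountpoints())  # ['mounts/extra_1', 'mounts/extra_2']
print(params.all_local_dataset_paths())  # [PosixPath('/data/main'), PosixPath('/data/extra_1')]
```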
1 change: 1 addition & 0 deletions InnerEye/ML/model_training.py
@@ -91,6 +91,7 @@ def create_lightning_trainer(container: LightningContainer,
:param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
:return: A tuple [Trainer object, diagnostic logger]
"""
logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
num_gpus = container.num_gpus_per_node()
effective_num_gpus = num_gpus * num_nodes
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
8 changes: 5 additions & 3 deletions InnerEye/ML/normalize_and_visualize_dataset.py
@@ -20,9 +20,9 @@
from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
from InnerEye.ML.deep_learning_config import ARGS_TXT
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import load_images_from_dataset_source
from health_azure import DatasetConfig


class NormalizeAndVisualizeConfig(GenericConfig):
@@ -73,8 +73,10 @@ def main(yaml_file_path: Path) -> None:
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
"""
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
runner = MLRunner(config, azure_config=runner_config)
local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
dataset_config = DatasetConfig(name=config.azure_dataset_id,
local_folder=config.local_dataset,
use_mounting=True)
local_dataset, mount_context = dataset_config.to_input_dataset_local(workspace=runner_config.get_workspace())
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
normalizer_config = NormalizeAndVisualizeConfig(**args)
actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
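For context, a hedged sketch of the hi-ml behaviour this change relies on. The call mirrors the one above; the return behaviour (preferring a provided local dataset over a download) is assumed from the CHANGELOG entry for this PR, and `workspace` stands for an AzureML Workspace such as `runner_config.get_workspace()`:

```python
from pathlib import Path

from health_azure import DatasetConfig

# Assumed behaviour: when local_folder points to an existing copy of the dataset, hi-ml
# returns that path directly and skips the download/mount; otherwise the dataset is
# mounted or downloaded from the AzureML workspace.
dataset_config = DatasetConfig(name="my_azure_dataset",
                               local_folder=Path("/data/my_local_copy"),
                               use_mounting=True)
local_path, mount_context = dataset_config.to_input_dataset_local(workspace=workspace)
print(f"Dataset files are read from {local_path}")
```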