diff --git a/CHANGELOG.md b/CHANGELOG.md index 87a234850..74c6d9784 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,7 @@ in inference-only runs when using lightning containers. correctly in the SimCLR module - ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs. +- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Fix issue where runs on a VM would download the dataset even when a local dataset is provided. ### Removed @@ -104,6 +105,7 @@ in inference-only runs when using lightning containers. - ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Removed cryptography from list of invalid packages in `test_invalid_python_packages` as it is already present as a dependency in our conda environment. - ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Removed obsolete `TrainGlaucomaCV` from PR build. +- ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Removed all code that downloads datasets, this is now all handled by hi-ml ### Deprecated diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py index b0330c117..e3cecfce4 100644 --- a/InnerEye/Azure/azure_runner.py +++ b/InnerEye/Azure/azure_runner.py @@ -12,13 +12,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional +from health_azure import DatasetConfig + from InnerEye.Azure.azure_config import AzureConfig, ParserResult from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME from InnerEye.Azure.secrets_handling import read_all_settings from InnerEye.Common.generic_parsing import GenericConfig from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.utils.config_loader import ModelConfigLoader -from health_azure import DatasetConfig SLEEP_TIME_SECONDS = 30 @@ -91,34 +92,56 @@ def create_experiment_name(azure_config: AzureConfig) -> str: def create_dataset_configs(azure_config: AzureConfig, all_azure_dataset_ids: List[str], - all_dataset_mountpoints: List[str]) -> List[DatasetConfig]: + all_dataset_mountpoints: List[str], + all_local_datasets: List[Optional[Path]]) -> List[DatasetConfig]: """ - Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be - skipped. + Sets up all the dataset consumption objects for the datasets provided. The returned list will have the same length + as there are non-empty azure dataset IDs. + + Valid arguments combinations: + N azure datasets, 0 or N mount points, 0 or N local datasets + :param azure_config: azure related configurations to use for model scale-out behaviour :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run. :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points. + :param all_local_datasets: The paths for all local versions of the datasets. :return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids, omitting datasets with an empty name. """ datasets: List[DatasetConfig] = [] - if len(all_dataset_mountpoints) > 0: - if len(all_azure_dataset_ids) != len(all_dataset_mountpoints): - raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) " - f"must equal the number of Azure dataset IDs ({len(all_azure_dataset_ids)})") + num_local = len(all_local_datasets) + num_azure = len(all_azure_dataset_ids) + num_mount = len(all_dataset_mountpoints) + if num_azure > 0 and (num_local == 0 or num_local == num_azure) and (num_mount == 0 or num_mount == num_azure): + # Test for valid settings: If we have N azure datasets, the local datasets and mount points need to either + # have exactly the same length, or 0. In the latter case, empty mount points and no local dataset will be + # assumed below. + count = num_azure + elif num_azure == 0 and num_mount == 0: + # No datasets in Azure at all: This is possible for runs that for example download their own data from the web. + # There can be any number of local datasets, but we are not checking that. In MLRunner.setup, there is a check + # that leaves local datasets intact if there are no Azure datasets. + return [] else: - all_dataset_mountpoints = [""] * len(all_azure_dataset_ids) - for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)): - if dataset_id: - datasets.append(DatasetConfig(name=dataset_id, - # Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs, - # setting to None works. - target_folder=mount_point or None, - use_mounting=azure_config.use_dataset_mount, - datastore=azure_config.azureml_datastore)) - elif mount_point: - raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has " - f"been provided ('{mount_point}')") + raise ValueError("Invalid dataset setup. You need to specify N entries in azure_datasets and a matching " + "number of local_datasets and dataset_mountpoints") + for i in range(count): + azure_dataset = all_azure_dataset_ids[i] if i < num_azure else "" + if not azure_dataset: + continue + mount_point = all_dataset_mountpoints[i] if i < num_mount else "" + local_dataset = all_local_datasets[i] if i < num_local else None + is_empty_azure_dataset = len(azure_dataset.strip()) == 0 + config = DatasetConfig(name=azure_dataset, + # Workaround for a bug in hi-ml 0.1.11: mount_point=="" creates invalid jobs, + # setting to None works. + target_folder=mount_point or None, + local_folder=local_dataset, + use_mounting=azure_config.use_dataset_mount, + datastore=azure_config.azureml_datastore) + if is_empty_azure_dataset: + config.name = "" + datasets.append(config) return datasets diff --git a/InnerEye/Common/fixed_paths.py b/InnerEye/Common/fixed_paths.py index 17435a8de..ceefc0f57 100755 --- a/InnerEye/Common/fixed_paths.py +++ b/InnerEye/Common/fixed_paths.py @@ -37,8 +37,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path: DEFAULT_LOGS_DIR_NAME = "logs" DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries" -# The folder at the project root directory that holds datasets for local execution. -DATASETS_DIR_NAME = "datasets" ML_RELATIVE_SOURCE_PATH = os.path.join("ML") ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py") diff --git a/InnerEye/Common/generic_parsing.py b/InnerEye/Common/generic_parsing.py index 38f8eed7f..4645a3212 100644 --- a/InnerEye/Common/generic_parsing.py +++ b/InnerEye/Common/generic_parsing.py @@ -6,6 +6,7 @@ import argparse import logging +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Set, Type, Union import param @@ -32,6 +33,50 @@ def _validate(self, val: Any) -> None: super()._validate(val) +class StringOrStringList(param.Parameter): + """ + Wrapper class to allow either a string or a list of strings. Internally represented always as a list. + """ + + def _validate(self, val: Any) -> None: + if isinstance(val, str): + return + if isinstance(val, List): + if all([isinstance(v, str) for v in val]): + return + raise ValueError(f"{val} must be a string or a list of strings") + + def set_hook(self, obj: Any, val: Any) -> Any: + """ + Modifies the value before calling the setter. Here, we are converting all strings to lists of strings. + """ + if isinstance(val, str): + return [val] + return val + + +class PathOrPathList(param.Parameter): + """ + Wrapper class to allow either a Path or a list of Paths. Internally represented always as a list. + """ + + def _validate(self, val: Any) -> None: + if isinstance(val, Path): + return + if isinstance(val, List): + if all([isinstance(v, Path) for v in val]): + return + raise ValueError(f"{val} must be a Path object or a list of paths") + + def set_hook(self, obj: Any, val: Any) -> Any: + """ + Modifies the value before calling the setter. Here, we are converting simple path to lists of path. + """ + if isinstance(val, Path): + return [val] + return val + + class IntTuple(param.NumericTuple): """ Parameter class that must always have integer values diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py index 6537539bf..ea4fc71bf 100644 --- a/InnerEye/ML/deep_learning_config.py +++ b/InnerEye/ML/deep_learning_config.py @@ -18,7 +18,7 @@ from InnerEye.Common.common_util import ModelProcessing, is_windows from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR_NAME from InnerEye.Common.generic_parsing import GenericConfig -from InnerEye.Common.type_annotations import PathOrString, TupleFloat2 +from InnerEye.Common.type_annotations import PathOrString, T, TupleFloat2 from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, create_unique_timestamp_id, \ get_best_checkpoint_path, get_recovery_checkpoint_path @@ -369,10 +369,10 @@ class DatasetParams(param.Parameterized): param.List(default=[], allow_None=False, doc="This can be used to feed in additional datasets to your custom datamodules. These will be" "mounted and made available as a list of paths in 'extra_local_datasets' when running in AML.") - extra_local_dataset_paths: List[Path] = param.List(class_=Path, default=[], allow_None=False, - doc="This can be used to feed in additional datasets " - "to your custom datamodules when running outside of Azure " - "AML.") + extra_local_dataset_paths: List[Optional[Path]] = \ + param.List(class_=Path, default=[], allow_None=False, + doc="This can be used to feed in additional datasets " + "to your custom datamodules when running outside of Azure AML.") dataset_mountpoint: str = param.String(doc="The path at which the AzureML dataset should be made available via " "mounting or downloading. This only affects jobs running in AzureML." "If empty, use a random mount/download point.") @@ -396,20 +396,29 @@ def all_azure_dataset_ids(self) -> List[str]: Returns a list with all azure dataset IDs that are specified in self.azure_dataset_id and self.extra_azure_dataset_ids """ - if not self.azure_dataset_id: - return self.extra_azure_dataset_ids - else: - return [self.azure_dataset_id] + self.extra_azure_dataset_ids + return self._concat_paths(self.azure_dataset_id, self.extra_azure_dataset_ids) def all_dataset_mountpoints(self) -> List[str]: """ Returns a list with all dataset mount points that are specified in self.dataset_mountpoint and self.extra_dataset_mountpoints """ - if not self.dataset_mountpoint: - return self.extra_dataset_mountpoints - else: - return [self.dataset_mountpoint] + self.extra_dataset_mountpoints + return self._concat_paths(self.dataset_mountpoint, self.extra_dataset_mountpoints) + + def all_local_dataset_paths(self) -> List[Path]: + """ + Returns a list with all dataset mount points that are specified in self.local_dataset and + self.extra_local_dataset_paths + """ + return self._concat_paths(self.local_dataset, self.extra_local_dataset_paths) # type: ignore + + def _concat_paths(self, item: Optional[T], items: List[T]) -> List[T]: + """ + Creates a list with the item going first (if it does not evaluate to False), then the rest of the items. + """ + if item is None or (isinstance(item, str) and not item.strip()): + return items + return [item] + items class OutputParams(param.Parameterized): diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py index 304a6f9da..eb5cea2b0 100644 --- a/InnerEye/ML/model_training.py +++ b/InnerEye/ML/model_training.py @@ -91,6 +91,7 @@ def create_lightning_trainer(container: LightningContainer, :param kwargs: Any additional keyowrd arguments will be passed to the constructor of Trainer. :return: A tuple [Trainer object, diagnostic logger] """ + logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}") num_gpus = container.num_gpus_per_node() effective_num_gpus = num_gpus * num_nodes # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory). diff --git a/InnerEye/ML/normalize_and_visualize_dataset.py b/InnerEye/ML/normalize_and_visualize_dataset.py index deca45166..9ab0d6f6a 100644 --- a/InnerEye/ML/normalize_and_visualize_dataset.py +++ b/InnerEye/ML/normalize_and_visualize_dataset.py @@ -20,9 +20,9 @@ from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources from InnerEye.ML.deep_learning_config import ARGS_TXT from InnerEye.ML.photometric_normalization import PhotometricNormalization -from InnerEye.ML.run_ml import MLRunner from InnerEye.ML.utils.config_loader import ModelConfigLoader from InnerEye.ML.utils.io_util import load_images_from_dataset_source +from health_azure import DatasetConfig class NormalizeAndVisualizeConfig(GenericConfig): @@ -73,8 +73,10 @@ def main(yaml_file_path: Path) -> None: In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below). """ config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path) - runner = MLRunner(config, azure_config=runner_config) - local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset) + dataset_config = DatasetConfig(name=config.azure_dataset_id, + local_folder=config.local_dataset, + use_mounting=True) + local_dataset, mount_context = dataset_config.to_input_dataset_local(workspace=runner_config.get_workspace()) dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME) normalizer_config = NormalizeAndVisualizeConfig(**args) actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py index b74706ef0..a18712831 100644 --- a/InnerEye/ML/run_ml.py +++ b/InnerEye/ML/run_ml.py @@ -15,7 +15,6 @@ import torch.multiprocessing from azureml._restclient.constants import RunStatus from azureml.core import Model, Run, model -from azureml.data import FileDataset from pytorch_lightning import LightningModule, seed_everything from pytorch_lightning.core.datamodule import LightningDataModule from torch.utils.data import DataLoader @@ -60,60 +59,12 @@ from InnerEye.ML.visualizers.plot_cross_validation import \ get_config_and_results_for_offline_runs, plot_cross_validation_from_files from health_azure import AzureRunInfo -from health_azure.datasets import get_or_create_dataset from health_azure.utils import ENVIRONMENT_VERSION, create_run_recovery_id, is_global_rank_zero, merge_conda_files ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any] PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None] -def download_dataset(azure_dataset_id: str, - target_folder: Path, - dataset_csv: str, - azure_config: AzureConfig) -> Path: - """ - Downloads or checks for an existing dataset on the executing machine. If a local_dataset is supplied and the - directory is present, return that. Otherwise, download the dataset specified by the azure_dataset_id from the - AzureML dataset attached to the given AzureML workspace. The dataset is downloaded into the `target_folder`, - in a subfolder that has the same name as the dataset. If there already appears to be such a folder, and the folder - contains a dataset csv file, no download is started. - :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace. - :param target_folder: The folder in which to download the dataset from Azure. - :param dataset_csv: Name of the csv file describing the dataset. This is only used to check if the dataset has been - downloaded already. - :param azure_config: All Azure-related configuration options. - :return: A path on the local machine that contains the dataset. - """ - logging.info("Trying to download dataset via AzureML datastore now.") - azure_dataset = get_or_create_dataset(workspace=azure_config.get_workspace(), - datastore_name=azure_config.azureml_datastore, - dataset_name=azure_dataset_id) - if not isinstance(azure_dataset, FileDataset): - raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}") - # The downloaded dataset may already exist from a previous run. - expected_dataset_path = target_folder / azure_dataset_id - logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.") - if expected_dataset_path.is_dir(): - if dataset_csv: - if (expected_dataset_path / dataset_csv).is_file(): - logging.info(f"The file {dataset_csv} is already downloaded in {expected_dataset_path}. Skipping.") - return expected_dataset_path - else: - existing_files = sum(1 for _ in expected_dataset_path.rglob("*")) - if existing_files > 1: - logging.info(f"There are already {existing_files} files in {expected_dataset_path}. Skipping.") - return expected_dataset_path - - logging.info("Starting to download the dataset - WARNING, this could take very long!") - with logging_section("Downloading dataset"): - t0 = time.perf_counter() - azure_dataset.download(target_path=str(expected_dataset_path), overwrite=False) - t1 = time.perf_counter() - t0 - logging.info(f"Azure dataset '{azure_dataset_id}' downloaded in {t1} seconds") - logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}") - return expected_dataset_path - - def check_dataset_folder_exists(local_dataset: PathOrString) -> Path: """ Checks if a folder with a local dataset exists. If it does exist, return the argument converted to a Path instance. @@ -202,30 +153,15 @@ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None: if self._has_setup_run: return if (not self.azure_config.only_register_model) and azure_run_info: - dataset_index = 0 - # Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails - # and config.local_dataset was not already set. + # Set up the paths to the datasets. azure_run_info already has all necessary information, using either + # the provided local datasets for VM runs, or the AzureML mount points when running in AML. # This must happen before container setup because that could already read datasets. - if self.container.azure_dataset_id: - mounted_dataset = azure_run_info.input_datasets[dataset_index] - if mounted_dataset is None: - mounted_dataset = self.download_or_use_existing_dataset(self.container.azure_dataset_id, - self.container.local_dataset) - self.container.local_dataset = mounted_dataset - dataset_index += 1 - if self.container.extra_azure_dataset_ids: - num_extra_local_datasets = len(self.container.extra_local_dataset_paths) + if len(azure_run_info.input_datasets) > 0: + self.container.local_dataset = check_dataset_folder_exists(azure_run_info.input_datasets[0]) extra_locals: List[Path] = [] - for i, extra_azure_id in enumerate(self.container.extra_azure_dataset_ids): - if num_extra_local_datasets > 0 and i < (num_extra_local_datasets - 1): - raise ValueError(f"The model refers to an Azure dataset '{extra_azure_id}' at index {i}, " - f"but there are not enough local datasets given ") - mounted_dataset = azure_run_info.input_datasets[dataset_index] - if mounted_dataset is None: - local = None if num_extra_local_datasets == 0 else self.container.extra_local_dataset_paths[i] - mounted_dataset = self.download_or_use_existing_dataset(extra_azure_id, local_dataset=local) - extra_locals.append(mounted_dataset) - self.container.extra_local_dataset_paths = extra_locals + for extra_datasets in azure_run_info.input_datasets[1:]: + extra_locals.append(check_dataset_folder_exists(extra_datasets)) + self.container.extra_local_dataset_paths = extra_locals # type: ignore # Ensure that we use fixed seeds before initializing the PyTorch models seed_everything(self.container.get_effective_random_seed()) # Creating the folder structure must happen before the LightningModule is created, because the output @@ -551,31 +487,6 @@ def create_activation_maps(self) -> None: activation_maps.extract_activation_maps(self.innereye_config) # type: ignore logging.info("Successfully extracted and saved activation maps") - def download_or_use_existing_dataset(self, - azure_dataset_id: Optional[str], - local_dataset: Optional[Path]) -> Path: - """ - Makes the dataset that the model uses available on the executing machine. If the present training run is outside - of AzureML, it expects that either the model has a `local_dataset` field set, in which case no action will be - taken. If a dataset is specified in `azure_dataset_id`, it will attempt to download the dataset from Azure - into the local repository, in the "datasets" folder. - :param azure_dataset_id: id of the dataset in AML workspace - :param local_dataset: alternatively local path for this dataset - :returns: The path of the dataset on the executing machine. - """ - if not self.is_offline_run: - raise ValueError("This function should only be called in runs outside AzureML.") - if local_dataset: - return check_dataset_folder_exists(local_dataset) - if azure_dataset_id: - dataset_csv = "" - if isinstance(self.model_config, DeepLearningConfig): - dataset_csv = self.model_config.dataset_csv - return download_dataset(azure_dataset_id=azure_dataset_id, - target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME, - dataset_csv=dataset_csv, azure_config=self.azure_config) - raise ValueError("The model must contain either local_dataset or azure_dataset_id") - def set_multiprocessing_start_method(self) -> None: """ Set the (PyTorch) multiprocessing start method. diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py index 74bb67d7a..c35d21eea 100755 --- a/InnerEye/ML/runner.py +++ b/InnerEye/ML/runner.py @@ -248,9 +248,12 @@ def submit_to_azureml_if_needed(self) -> AzureRunInfo: if not self.lightning_container.regression_test_folder: ignored_folders.append("RegressionTestResults") - input_datasets = create_dataset_configs(self.azure_config, - all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(), - all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints()) + all_local_datasets = self.lightning_container.all_local_dataset_paths() + input_datasets = \ + create_dataset_configs(self.azure_config, + all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(), + all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints(), + all_local_datasets=all_local_datasets) # type: ignore def after_submission_hook(azure_run: Run) -> None: """ diff --git a/InnerEye/ML/utils/checkpoint_handling.py b/InnerEye/ML/utils/checkpoint_handling.py index 755a5a928..a445743fe 100644 --- a/InnerEye/ML/utils/checkpoint_handling.py +++ b/InnerEye/ML/utils/checkpoint_handling.py @@ -91,6 +91,10 @@ def get_recovery_or_checkpoint_path_train(self) -> Optional[Path]: the latest checkpoint will be present in this folder too. :return: Constructed checkpoint path to recover from. """ + checkpoints = list(self.container.checkpoint_folder.rglob("*")) + logging.debug(f"Available checkpoints: {len(checkpoints)}") + for f in checkpoints: + logging.debug(f) recovery = find_recovery_checkpoint_and_epoch(self.container.checkpoint_folder) if recovery is not None: local_recovery_path, recovery_epoch = recovery diff --git a/Tests/Azure/test_azure_config.py b/Tests/Azure/test_azure_config.py index 7fc4e300d..b889779f7 100644 --- a/Tests/Azure/test_azure_config.py +++ b/Tests/Azure/test_azure_config.py @@ -2,10 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +from pathlib import Path, PosixPath +from typing import List + import pytest from InnerEye.Azure.azure_config import AzureConfig from InnerEye.Azure.azure_runner import create_dataset_configs +from InnerEye.ML.deep_learning_config import DatasetParams from Tests.ML.util import get_default_azure_config @@ -17,35 +21,98 @@ def test_validate() -> None: def test_dataset_consumption1() -> None: """ - Test that an empty dataset ID will not produce any dataset consumption. + Creating datasets, case 1: Azure datasets given, no local folders or mount points + :return: """ azure_config = get_default_azure_config() - assert len(create_dataset_configs(azure_config, [""], [""])) == 0 + datasets = create_dataset_configs(azure_config, + all_azure_dataset_ids=["1", "2"], + all_dataset_mountpoints=[], + all_local_datasets=[]) + assert len(datasets) == 2 + assert datasets[0].name == "1" + assert datasets[1].name == "2" + for i in range(2): + assert datasets[i].local_folder is None + assert datasets[i].target_folder is None + + # Two error cases: number of mount points or number of local datasets does not match + with pytest.raises(ValueError) as ex: + create_dataset_configs(azure_config, + all_azure_dataset_ids=["1", "2"], + all_dataset_mountpoints=["mp"], + all_local_datasets=[]) + assert "Invalid dataset setup" in str(ex) + with pytest.raises(ValueError) as ex: + create_dataset_configs(azure_config, + all_azure_dataset_ids=["1", "2"], + all_dataset_mountpoints=[], + all_local_datasets=[Path("local")]) + assert "Invalid dataset setup" in str(ex) def test_dataset_consumption2() -> None: """ - Test that given mount point with empty dataset ID raises an error + Creating datasets, case 2: Azure datasets, local folders and mount points given """ azure_config = get_default_azure_config() - with pytest.raises(ValueError) as ex: - create_dataset_configs(azure_config, [""], ["foo"]) - assert "but a mount point has been provided" in str(ex) + datasets = create_dataset_configs(azure_config, + all_azure_dataset_ids=["1", "2"], + all_dataset_mountpoints=["mp1", "mp2"], + all_local_datasets=[Path("l1"), Path("l2")]) + assert len(datasets) == 2 + assert datasets[0].name == "1" + assert datasets[1].name == "2" + assert datasets[0].local_folder == Path("l1") + assert datasets[1].local_folder == Path("l2") + assert datasets[0].target_folder == PosixPath("mp1") + assert datasets[1].target_folder == PosixPath("mp2") def test_dataset_consumption3() -> None: """ - Test that a matching number of mount points is created. + Creating datasets, case 3: local datasets only. This should generate no results """ azure_config = get_default_azure_config() - assert len(create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], [])) == 2 - + datasets = create_dataset_configs(azure_config, + all_azure_dataset_ids=[], + all_dataset_mountpoints=[], + all_local_datasets=[Path("l1"), Path("l2")]) + assert len(datasets) == 0 def test_dataset_consumption4() -> None: """ - Test error handling for number of mount points. + Creating datasets, case 4: no datasets at all """ azure_config = get_default_azure_config() - with pytest.raises(ValueError) as ex: - create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], ["foo"]) - assert "must equal the number of Azure dataset IDs" in str(ex) + datasets = create_dataset_configs(azure_config, + all_azure_dataset_ids=[], + all_dataset_mountpoints=[], + all_local_datasets=[]) + assert len(datasets) == 0 + + +@pytest.mark.parametrize(("first", "extra", "expected"), + [(None, [Path("foo")], [Path("foo")]), + (Path("bar"), [Path("foo")], [Path("bar"), Path("foo")])]) +def test_dataset_params_local(first: Path, extra: List[Path], expected: List[Path]) -> None: + """ + Local datasets that are None should be excluded from all_local_dataset_paths + """ + p = DatasetParams() + p.local_dataset = first + p.extra_local_dataset_paths = extra # type: ignore + assert p.all_local_dataset_paths() == expected + + +@pytest.mark.parametrize(("first", "extra", "expected"), + [("", ["foo"], ["foo"]), + ("bar", ["foo"], ["bar", "foo"])]) +def test_dataset_params_azure(first: str, extra: List[str], expected: List[Path]) -> None: + """ + Azure datasets that are None or an empty string should be excluded from all_azure_dataset_ids + """ + p = DatasetParams() + p.azure_dataset_id = first + p.extra_azure_dataset_ids = extra + assert p.all_azure_dataset_ids() == expected diff --git a/Tests/Common/test_commandline_parsing.py b/Tests/Common/test_commandline_parsing.py index f6b8ad42f..f827dd5d8 100644 --- a/Tests/Common/test_commandline_parsing.py +++ b/Tests/Common/test_commandline_parsing.py @@ -24,7 +24,8 @@ def test_create_ml_runner_args(is_container: bool, test_output_dirs: OutputFolderForTests, is_offline_run: bool, - set_output_to: bool) -> None: + set_output_to: bool, + tmpdir: Path) -> None: """Test round trip parsing of commandline arguments: From arguments to the Azure runner to the arguments of the ML runner, checking that whatever is passed on can be correctly parsed. It also checks that the output files go into the right place @@ -32,10 +33,11 @@ def test_create_ml_runner_args(is_container: bool, logging_to_stdout() model_name = "DummyContainerWithPlainLightning" if is_container else "DummyModel" if is_container: - dataset_folder = Path("download") + dataset_folder = tmpdir else: local_dataset = DummyModel().local_dataset assert local_dataset is not None + assert local_dataset.is_dir() dataset_folder = local_dataset outputs_folder = test_output_dirs.root_dir project_root = fixed_paths.repository_root_directory() @@ -56,22 +58,20 @@ def test_create_ml_runner_args(is_container: bool, with mock.patch("sys.argv", [""] + args_list): with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run): with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None): - with mock.patch("InnerEye.ML.run_ml.MLRunner.download_or_use_existing_dataset", - return_value=dataset_folder): - runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE) - runner.parse_and_load_model() - # Only when calling config.create_filesystem we expect to see the correct paths, and this happens - # inside run_in_situ - azure_run_info = AzureRunInfo(input_datasets=[None], - output_datasets=[None], - run=None, - is_running_in_azure_ml=False, - output_folder=Path.cwd(), - logs_folder=Path.cwd(), - mount_contexts=[]) - runner.run_in_situ(azure_run_info) - azure_config = runner.azure_config - container_or_legacy_config = runner.lightning_container if is_container else runner.model_config + runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE) + runner.parse_and_load_model() + azure_run_info = AzureRunInfo(input_datasets=[dataset_folder], + output_datasets=[None], + run=None, + is_running_in_azure_ml=False, + output_folder=Path.cwd(), + logs_folder=Path.cwd(), + mount_contexts=[]) + # Only when calling config.create_filesystem we expect to see the correct paths, and this happens + # inside run_in_situ + runner.run_in_situ(azure_run_info) + azure_config = runner.azure_config + container_or_legacy_config = runner.lightning_container if is_container else runner.model_config assert azure_config.model == model_name assert container_or_legacy_config is not None if not is_container: @@ -90,6 +90,8 @@ def test_create_ml_runner_args(is_container: bool, assert container_or_legacy_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME) assert not hasattr(container_or_legacy_config, "azureml_datastore") + # Container setup should copy the path of the local dataset from AzureRunInfo to the local_dataset field + assert container_or_legacy_config.local_dataset == dataset_folder def test_overridable_properties() -> None: diff --git a/Tests/Common/test_generic_parsing.py b/Tests/Common/test_generic_parsing.py index 92f55ebe8..547f80cf3 100644 --- a/Tests/Common/test_generic_parsing.py +++ b/Tests/Common/test_generic_parsing.py @@ -3,12 +3,14 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ from enum import Enum +from pathlib import Path from typing import Any, List, Optional, Tuple import param import pytest -from InnerEye.Common.generic_parsing import GenericConfig, IntTuple, create_from_matching_params +from InnerEye.Common.generic_parsing import GenericConfig, IntTuple, PathOrPathList, StringOrStringList, \ + create_from_matching_params class ParamEnum(Enum): @@ -34,6 +36,39 @@ class ParamClass(GenericConfig): constant: str = param.String("Nope", constant=True) +class ParamOrList(GenericConfig): + string_param: List[str] = StringOrStringList(default=[]) + path_param: List[Path] = PathOrPathList(default=[]) + + +def test_string_or_list() -> None: + c = ParamOrList() + v1 = "foo" + c.string_param = v1 + assert c.string_param == [v1] + v2 = ["foo", "bar"] + c.string_param = v2 + assert c.string_param == v2 + with pytest.raises(ValueError): + c.string_param = 1 + with pytest.raises(ValueError): + c.string_param = [1] + + +def test_path_or_list() -> None: + c = ParamOrList() + v1 = Path.cwd() + c.path_param = v1 + assert c.path_param == [v1] + v2 = [Path("foo"), Path("bar")] + c.path_param = v2 + assert c.path_param == v2 + with pytest.raises(ValueError): + c.path_param = 1 + with pytest.raises(ValueError): + c.path_param = [1] + + def test_overridable_parameter() -> None: """ Test to check overridable parameters are correctly identified. diff --git a/Tests/ML/test_download_upload.py b/Tests/ML/test_download_upload.py index d6afa1ac7..4f7f67b42 100644 --- a/Tests/ML/test_download_upload.py +++ b/Tests/ML/test_download_upload.py @@ -3,29 +3,18 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ import logging -import shutil from pathlib import Path -from typing import Any, List, Optional -from unittest import mock +from typing import List import pytest from InnerEye.Azure.azure_config import AzureConfig -from InnerEye.Common import fixed_paths -from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout -from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path +from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_to_stdout from InnerEye.Common.output_directories import OutputFolderForTests -from InnerEye.ML.common import DATASET_CSV_FILE_NAME -from InnerEye.ML.deep_learning_config import DeepLearningConfig -from InnerEye.ML.lightning_container import LightningContainer from InnerEye.ML.model_config_base import ModelConfigBase -from InnerEye.ML.run_ml import MLRunner from InnerEye.ML.utils.run_recovery import RunRecovery from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLBACK_SINGLE_RUN, get_most_recent_run -from Tests.ML.configs.DummyModel import DummyModel -from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets from Tests.ML.util import get_default_azure_config -from health_azure import AzureRunInfo logging_to_stdout(logging.DEBUG) @@ -76,188 +65,3 @@ def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderFo assert (other_runs_folder / child).is_dir(), "Child run folder does not exist" for checkpoint in run_recovery.get_best_checkpoint_paths(): assert checkpoint.is_file(), f"File {checkpoint} does not exist" - - -def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None: - dataset_name = "test-dataset" - config = DummyModel() - config.local_dataset = None - config.azure_dataset_id = "" - azure_config = get_default_azure_config() - runner = MLRunner(config, azure_config=azure_config) - # If the model has neither local_dataset or azure_dataset_id, mount_or_download_dataset should fail. - # This mounting call must happen before any other operations on the container, because already the model - # creation may need access to the dataset. - with pytest.raises(ValueError) as ex: - runner.setup() - assert ex.value.args[0] == "Expecting that a dataset is available here." - runner.project_root = test_output_dirs.root_dir - - # Pointing the model to a dataset folder that does not exist should raise an Exception - fake_folder = runner.project_root / "foo" - runner.container.local_dataset = fake_folder - with pytest.raises(FileNotFoundError): - runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, runner.container.local_dataset) - - # If the local dataset folder exists, mount_or_download_dataset should not do anything. - fake_folder.mkdir() - local_dataset = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, - runner.container.local_dataset) - assert local_dataset == fake_folder - - # Pointing the model to a dataset in Azure should trigger a download - runner.container.local_dataset = None - runner.container.azure_dataset_id = dataset_name - with logging_section("Starting download"): - result_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, - runner.container.local_dataset) - # Download goes into / "datasets" / "test_dataset" - expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name - assert result_path == expected_path - assert result_path.is_dir() - dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME - assert dataset_csv.is_file() - # Check that each individual file in the dataset is present - for folder in [1, 10]: - sub_folder = result_path / str(folder) - sub_folder.is_dir() - for file in ["esophagus", "heart", "lung_l", "lung_r", "spinalcord"]: - f = (sub_folder / file).with_suffix(".nii.gz") - assert f.is_file() - - -def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests, - is_offline_run: bool, - local_dataset: Optional[Path], - azure_dataset: str, - is_lightning_model: bool) -> LightningContainer: - config: Optional[DeepLearningConfig] = None - container: Optional[LightningContainer] = None - if is_lightning_model: - container = DummyContainerWithDatasets() - container.azure_dataset_id = azure_dataset - container.local_dataset = local_dataset - else: - config = DummyModel() - config.azure_dataset_id = azure_dataset - config.local_dataset = local_dataset - # The legacy InnerEye models require an existing dataset_csv file present in the dataset folder. Create that. - download_path = test_output_dirs.root_dir / "downloaded" - mount_path = test_output_dirs.root_dir / "mounted" - if not is_lightning_model: - train_and_test_data = "train_and_test_data" - for path in [download_path, mount_path, test_output_dirs.root_dir]: - # If destination folder exists, delete content to ensure consistency and avoid 'FileExistsError' - if (path / train_and_test_data).is_dir(): - shutil.rmtree(path / train_and_test_data) - - # Creates directory structure and copy data - shutil.copytree(full_ml_test_data_path(train_and_test_data), path / train_and_test_data) - # Copy 'data.csv' file - shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), path / DATASET_CSV_FILE_NAME) - - with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run): - with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path): - runner = MLRunner(config, container=container, - azure_config=None, project_root=test_output_dirs.root_dir) - path_from_aml: List[Optional[Path]] = [None] if is_offline_run else [mount_path] - runner.setup(azure_run_info=AzureRunInfo(input_datasets=path_from_aml, - output_datasets=[], - run=None, - is_running_in_azure_ml=False, - output_folder=Path(), - logs_folder=Path(), - mount_contexts=[] - )) - return runner.container - - -@pytest.mark.parametrize(("is_lightning_model", "expected_error"), - [ - # A built-in InnerEye model must have either local dataset or azure dataset provided. - (False, "Expecting that a dataset is available here."), - # ... but this is OK for Lightning container models. A Lightning container could simply - # download its data from the web before training. - (True, "") - ]) -def test_mount_failing_offline_runs(test_output_dirs: OutputFolderForTests, - is_lightning_model: bool, - expected_error: str) -> None: - """ - Test cases when MLRunner.mount_or_download_dataset raises an exception, when running outside AzureML. - """ - - def run() -> Any: - return _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=True, - local_dataset=None, - azure_dataset="", - is_lightning_model=is_lightning_model) - - if expected_error: - with pytest.raises(ValueError) as ex: - run() - assert expected_error in str(ex) - else: - assert run().local_dataset is None - - -def test_mount_in_azureml1(test_output_dirs: OutputFolderForTests) -> None: - """ - Test cases when MLRunner.mount_or_download_dataset runs inside AzureML. - """ - container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=False, - local_dataset=None, - azure_dataset="foo", - is_lightning_model=False) - assert "mounted" in str(container.local_dataset) - - -def test_mount_in_azureml2(test_output_dirs: OutputFolderForTests) -> None: - """ - Test cases when MLRunner.mount_or_download_dataset runs inside AzureML. - """ - container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=False, - local_dataset=None, - azure_dataset="", - is_lightning_model=True) - assert container.local_dataset is None - - -def test_mount_or_download(test_output_dirs: OutputFolderForTests) -> None: - """ - Tests the different combinations of local and Azure datasets, with Innereye built-in and container models. - """ - root = test_output_dirs.root_dir - for is_lightning_model in [True, False]: - # With runs outside of AzureML, an AML dataset should get downloaded. - container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=True, - local_dataset=None, - azure_dataset="foo", - is_lightning_model=is_lightning_model) - assert "downloaded" in str(container.local_dataset) - # For all InnerEye built-in models, the paths from container level need to be copied down to legacy config - # level. - if not is_lightning_model: - assert container.config.local_dataset == container.local_dataset - # With runs in AzureML, an AML dataset should get mounted. - container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=False, - local_dataset=None, - azure_dataset="foo", - is_lightning_model=is_lightning_model) - assert "mounted" in str(container.local_dataset) - if not is_lightning_model: - assert container.config.local_dataset == container.local_dataset - - container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs, - is_offline_run=True, - local_dataset=root, - azure_dataset="", - is_lightning_model=is_lightning_model) - assert container.local_dataset == root - if not is_lightning_model: - assert container.config.local_dataset == container.local_dataset diff --git a/Tests/ML/test_lightning_containers.py b/Tests/ML/test_lightning_containers.py index cca9eb79c..109901a97 100644 --- a/Tests/ML/test_lightning_containers.py +++ b/Tests/ML/test_lightning_containers.py @@ -256,7 +256,7 @@ def _create_container(extra_local_dataset_paths: List[Path] = [], extra_azure_dataset_ids: List[str] = []) -> LightningContainer: container = DummyContainerWithModel() container.local_dataset = test_output_dirs.root_dir - container.extra_local_dataset_paths = extra_local_dataset_paths + container.extra_local_dataset_paths = extra_local_dataset_paths # type: ignore container.extra_azure_dataset_ids = extra_azure_dataset_ids runner = MLRunner(model_config=None, container=container) runner.setup()