 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import pandas as pd
+from pytorch_lightning.core.datamodule import LightningDataModule
 import stopit
 import torch.multiprocessing
 from azureml._restclient.constants import RunStatus
@@ -120,19 +121,16 @@ def download_dataset(azure_dataset_id: str,
     return expected_dataset_path
 
 
-def log_metrics(val_metrics: Optional[InferenceMetricsForSegmentation],
-                test_metrics: Optional[InferenceMetricsForSegmentation],
-                train_metrics: Optional[InferenceMetricsForSegmentation],
+def log_metrics(metrics: Dict[ModelExecutionMode, InferenceMetrics],
                 run_context: Run) -> None:
     """
     Log metrics for each split to the provided run, or to the current run context if None is provided.
-    :param val_metrics: Inference results for the validation split
-    :param test_metrics: Inference results for the test split
-    :param train_metrics: Inference results for the train split
+    :param metrics: Dictionary of inference results for each split.
     :param run_context: Run to log the metrics to; use the current run context if None is provided.
     """
-    for split in [x for x in [val_metrics, test_metrics, train_metrics] if x]:
-        split.log_metrics(run_context)
+    for split in metrics.values():
+        if isinstance(split, InferenceMetricsForSegmentation):
+            split.log_metrics(run_context)
 
 
 class MLRunner:
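For orientation, a minimal sketch of how a caller might use the reworked `log_metrics`; the metric objects below are placeholders for whatever `model_test` returns, not names from this PR:

# Hypothetical usage of the new dict-based signature (names are illustrative).
metrics = {
    ModelExecutionMode.TEST: test_split_metrics,
    ModelExecutionMode.VAL: val_split_metrics,
}
log_metrics(metrics=metrics, run_context=RUN_CONTEXT)
# Values that are not InferenceMetricsForSegmentation are skipped by the isinstance check.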
@@ -390,7 +388,7 @@ def run(self) -> None:
 
         # If this is a cross-validation run, and the present run is child run 0, then wait for the sibling
         # runs, build the ensemble model, and write a report for that.
-        if self.container.number_of_cross_validation_splits > 0:
+        if self.container.perform_cross_validation:
             should_wait_for_other_child_runs = (not self.is_offline_run) and \
                                                self.container.cross_validation_split_index == 0
             if should_wait_for_other_child_runs:
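The new `perform_cross_validation` flag encapsulates the split-count check that was previously inlined. Its definition is not shown in this diff; a plausible sketch, assuming it simply wraps the old condition:

# Assumed shape of the container property; not part of this diff.
@property
def perform_cross_validation(self) -> bool:
    # Mirrors the inlined check that this PR replaces.
    return self.number_of_cross_validation_splits > 0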
@@ -420,10 +418,24 @@ def is_normal_run_or_crossval_child_0(self) -> bool:
420418 """
421419 Returns True if the present run is a non-crossvalidation run, or child run 0 of a crossvalidation run.
422420 """
423- if self .container .number_of_cross_validation_splits > 0 :
421+ if self .container .perform_cross_validation :
424422 return self .container .cross_validation_split_index == 0
425423 return True
426424
+    @staticmethod
+    def lightning_data_module_dataloaders(data: LightningDataModule) -> Dict[ModelExecutionMode, Callable]:
+        """
+        Given a Lightning data module, return a dictionary with the dataloader factory for each execution mode.
+
+        :param data: Lightning data module.
+        :return: Dataloader factory for each model execution mode.
+        """
+        return {
+            ModelExecutionMode.TEST: data.test_dataloader,
+            ModelExecutionMode.VAL: data.val_dataloader,
+            ModelExecutionMode.TRAIN: data.train_dataloader
+        }
+
     def run_inference_for_lightning_models(self, checkpoint_paths: List[Path]) -> None:
         """
         Run inference on the test set for all models that are specified via a LightningContainer.
@@ -439,11 +451,10 @@ def run_inference_for_lightning_models(self, checkpoint_paths: List[Path]) -> None:
         # Read the data modules before changing the working directory, in case the code relies on relative paths
         data = self.container.get_inference_data_module()
         dataloaders: List[Tuple[DataLoader, ModelExecutionMode]] = []
-        if self.container.perform_validation_and_test_set_inference:
-            dataloaders.append((data.test_dataloader(), ModelExecutionMode.TEST))  # type: ignore
-            dataloaders.append((data.val_dataloader(), ModelExecutionMode.VAL))  # type: ignore
-        if self.container.perform_training_set_inference:
-            dataloaders.append((data.train_dataloader(), ModelExecutionMode.TRAIN))  # type: ignore
+        data_dataloaders = MLRunner.lightning_data_module_dataloaders(data)
+        for data_split, dataloader in data_dataloaders.items():
+            if self.container.inference_on_set(ModelProcessing.DEFAULT, data_split):
+                dataloaders.append((dataloader(), data_split))
         checkpoint = load_checkpoint(checkpoint_paths[0], use_gpu=self.container.use_gpu)
         lightning_model.load_state_dict(checkpoint['state_dict'])
         lightning_model.eval()
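Because `lightning_data_module_dataloaders` stores the bound `*_dataloader` methods rather than calling them, a loader is only materialized for splits that pass the `inference_on_set` check. A standalone sketch of that laziness, using a hypothetical data module:

# MyDataModule is a hypothetical LightningDataModule subclass.
data = MyDataModule()
factories = MLRunner.lightning_data_module_dataloaders(data)
# Nothing has been built yet: the dict values are bound methods.
test_loader = factories[ModelExecutionMode.TEST]()  # constructed only on this call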
@@ -491,8 +502,8 @@ def run_inference(self, checkpoint_handler: CheckpointHandler,
491502 """
492503
493504 # run full image inference on existing or newly trained model on the training, and testing set
-        test_metrics, val_metrics, _ = self.model_inference_train_and_test(checkpoint_handler=checkpoint_handler,
-                                                                           model_proc=model_proc)
+        self.model_inference_train_and_test(checkpoint_handler=checkpoint_handler,
+                                            model_proc=model_proc)
 
         self.try_compare_scores_against_baselines(model_proc)
 
@@ -752,37 +763,25 @@ def copy_file(source: Path, destination_file: str) -> None:
     def model_inference_train_and_test(self,
                                        checkpoint_handler: CheckpointHandler,
                                        model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> \
-            Tuple[Optional[InferenceMetrics], Optional[InferenceMetrics], Optional[InferenceMetrics]]:
-        train_metrics = None
-        val_metrics = None
-        test_metrics = None
+            Dict[ModelExecutionMode, InferenceMetrics]:
+        metrics: Dict[ModelExecutionMode, InferenceMetrics] = {}
 
         config = self.innereye_config
 
-        def run_model_test(data_split: ModelExecutionMode) -> Optional[InferenceMetrics]:
-            return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,  # type: ignore
-                              model_proc=model_proc)
-
-        if config.perform_validation_and_test_set_inference:
-            # perform inference on test set
-            test_metrics = run_model_test(ModelExecutionMode.TEST)
-            # perform inference on validation set (not for ensemble as current val is in the training fold
-            # for at least one of the models).
-            if model_proc != ModelProcessing.ENSEMBLE_CREATION:
-                val_metrics = run_model_test(ModelExecutionMode.VAL)
-
-        if config.perform_training_set_inference:
-            # perform inference on training set if required
-            train_metrics = run_model_test(ModelExecutionMode.TRAIN)
+        for data_split in ModelExecutionMode:
+            if self.container.inference_on_set(model_proc, data_split):
+                opt_metrics = model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,
+                                         model_proc=model_proc)
+                if opt_metrics is not None:
+                    metrics[data_split] = opt_metrics
 
         # log the metrics to the AzureML experiment if possible. When doing ensemble runs, log to the Hyperdrive
         # parent run, so that we get the metrics of child run 0 and the ensemble separated.
         if config.is_segmentation_model and not self.is_offline_run:
             run_for_logging = PARENT_RUN_CONTEXT if model_proc == ModelProcessing.ENSEMBLE_CREATION else RUN_CONTEXT
-        log_metrics(val_metrics=val_metrics, test_metrics=test_metrics,  # type: ignore
-                    train_metrics=train_metrics, run_context=run_for_logging)  # type: ignore
+        log_metrics(metrics=metrics, run_context=run_for_logging)  # type: ignore
 
-        return test_metrics, val_metrics, train_metrics
+        return metrics
 
     @stopit.threading_timeoutable()
     def wait_for_runs_to_finish(self, delay: int = 60) -> None:
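The loop in `model_inference_train_and_test` replaces the two per-split flags with a single `inference_on_set` gate whose definition is not part of this diff. A sketch of one implementation that would reproduce the previous behaviour, assuming the legacy flags are still available on the container:

# Sketch only: a plausible inference_on_set reproducing the old per-split logic.
def inference_on_set(self, model_proc: ModelProcessing,
                     data_split: ModelExecutionMode) -> bool:
    if data_split == ModelExecutionMode.TRAIN:
        return self.perform_training_set_inference
    if data_split == ModelExecutionMode.VAL:
        # Skip the val split for ensembles: it lies in the training fold of
        # at least one of the child models.
        return (self.perform_validation_and_test_set_inference
                and model_proc != ModelProcessing.ENSEMBLE_CREATION)
    return self.perform_validation_and_test_set_inference  # TEST split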