This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -32,6 +32,7 @@ any large models anymore because data loaders ran out of memory.

### Added

- ([#488](https://github.com/microsoft/InnerEye-DeepLearning/pull/488)) Better handling of missing seriesId in segmentation cross validation reports.
- ([#454](https://github.com/microsoft/InnerEye-DeepLearning/pull/454)) Checking that labels are mutually exclusive.
- ([#447](https://github.com/microsoft/InnerEye-DeepLearning/pull/447/)) Added a sanity check to ensure there are no
  missing channels or missing files. If missing channels in the csv file or filenames associated with channels are
3 changes: 3 additions & 0 deletions InnerEye/ML/visualizers/plot_cross_validation.py
@@ -552,6 +552,9 @@ def convert_rows_for_comparisons(split_column_value: Optional[str],
    :return: augmented subset of the rows in df, as described
    """
    pre_len = len(df)
    # If series id is not present, add a default value
    if CSV_SERIES_HEADER not in dataset_df.columns:
        dataset_df[CSV_SERIES_HEADER] = ''
    # We need the institution column to compare subjects across institutions; if it is not present, we add a default
    # value
    if CSV_INSTITUTION_HEADER not in dataset_df.columns:
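The fix above follows a simple pandas pattern: when an optional column is absent from the loaded dataset, it is added with an empty-string default so that downstream grouping and comparisons can index it unconditionally. A minimal standalone sketch of that pattern, using hypothetical column names rather than the InnerEye CSV header constants:

```python
import pandas as pd

# Hypothetical dataset that lacks a "seriesId" column.
dataset_df = pd.DataFrame({"subject": [1, 2], "institutionId": ["A", "B"]})

# Add any missing optional column with an empty-string default,
# so later code can reference it unconditionally.
for optional_column in ("seriesId", "institutionId"):
    if optional_column not in dataset_df.columns:
        dataset_df[optional_column] = ''

assert list(dataset_df["seriesId"]) == ['', '']
```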
60 changes: 58 additions & 2 deletions Tests/ML/visualizers/test_plot_cross_validation.py
@@ -3,7 +3,8 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
import shutil
from typing import Callable, Dict, List, Optional, Set, Tuple

import pandas as pd
import pytest
@@ -103,17 +104,72 @@ def create_file_list_for_segmentation_recovery_run(test_config_ensemble: PlotCro
folder="main_1570466706163110")


def copy_run_result_files(files: List[RunResultFiles], src_prefix_path: Path,
                          dst_prefix_path: Path, transformer: Callable) -> List[RunResultFiles]:
    """
    Copy dataset_csv_files from a list of RunResultFiles to a working directory, and then
    transform them using a callback.

    :param files: List of RunResultFiles to copy.
    :param src_prefix_path: Shared prefix path for the dataset_csv_files to be removed.
    :param dst_prefix_path: Shared prefix path to use for the copied dataset_csv_files.
    :param transformer: Callback function to apply to the copied dataset_csv_files.
    :return: New list of RunResultFiles pointing at the copied files.
    """
    file_copies = []
    files_copied = []

    for file in files:
        if not file.dataset_csv_file:
            dataset_csv_file: Optional[Path] = None
        else:
            # Replace prefix path
            dst_dataset_csv_file = dst_prefix_path / file.dataset_csv_file.relative_to(src_prefix_path)
            if dst_dataset_csv_file not in files_copied:
                dst_dataset_csv_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(file.dataset_csv_file, dst_dataset_csv_file)
                files_copied.append(dst_dataset_csv_file)
                transformer(dst_dataset_csv_file)
            dataset_csv_file = dst_dataset_csv_file

        file_copy = RunResultFiles(execution_mode=file.execution_mode,
                                   metrics_file=file.metrics_file,
                                   dataset_csv_file=dataset_csv_file,
                                   run_recovery_id=file.run_recovery_id,
                                   split_index=file.split_index)
        file_copies.append(file_copy)

    return file_copies
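
The helper's path handling relies on a standard pathlib idiom: `relative_to` strips the shared source prefix, so a single join re-roots each file under the destination folder. A minimal sketch with hypothetical paths standing in for `full_ml_test_data_path()` and the test output directory:

```python
from pathlib import Path

# Hypothetical source and destination prefixes.
src_prefix = Path("/data/ml_test_data")
dst_prefix = Path("/tmp/test_outputs")

src_file = src_prefix / "run_0" / "dataset.csv"

# relative_to() removes the shared prefix; joining onto the destination re-roots the file.
dst_file = dst_prefix / src_file.relative_to(src_prefix)
assert dst_file == Path("/tmp/test_outputs/run_0/dataset.csv")
```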


@pytest.mark.after_training_ensemble_run
def test_metrics_preparation_for_segmentation(test_config: PlotCrossValidationConfig) -> None:
@pytest.mark.parametrize("drop_column", [None, CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER])
def test_metrics_preparation_for_segmentation(drop_column: Optional[str],
                                              test_config: PlotCrossValidationConfig,
                                              test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but
    were downloaded from the run whose ID is given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
    Additionally test that CSV_INSTITUTION_HEADER or CSV_SERIES_HEADER can be dropped from the dataset_csv_file.
    """
    files = create_file_list_for_segmentation_recovery_run(test_config)
    if drop_column:
        def drop_csv_column(path: Path) -> None:
            """
            Load a csv file, drop a column, and save the csv file.
            :param path: Path to the csv file.
            """
            df = pd.read_csv(path)
            dropped_df = df.drop(drop_column, axis=1)
            dropped_df.to_csv(path)
        files = copy_run_result_files(files, full_ml_test_data_path(), test_output_dirs.root_dir, drop_csv_column)
    downloaded_metrics = load_dataframes(files, test_config)
    assert test_config.run_recovery_id
    for mode in test_config.execution_modes_to_download():
        expected_df = _get_metrics_df(test_config.run_recovery_id, mode)
        if drop_column:
            # A dropped column is re-created with an empty-string default, so reflect that in the expected dataframe.
            expected_df[drop_column] = ''
        # Drop the "mode" column, because that was added after creating the test data
        metrics = downloaded_metrics[mode]
        assert metrics is not None
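For context, `drop_csv_column` relies on `DataFrame.drop` returning a new frame without the named column, and the parametrized test then expects the loading code to re-create that column with an empty-string default. A minimal sketch of the drop step alone, using hypothetical CSV contents:

```python
import pandas as pd
from io import StringIO

# Hypothetical dataset.csv contents containing a seriesId column.
csv_text = "subject,seriesId,institutionId\n1,1.22.333.1,A\n2,1.22.333.2,B\n"
df = pd.read_csv(StringIO(csv_text))

# drop() returns a copy without the column; the original frame is unchanged.
dropped = df.drop("seriesId", axis=1)
assert "seriesId" not in dropped.columns
assert "seriesId" in df.columns
```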