diff --git a/CHANGELOG.md b/CHANGELOG.md
index 723f5d135..af057d167 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,7 @@ any large models anymore because data loaders ran out of memory.
 
 ### Added
 
+- ([#488](https://github.com/microsoft/InnerEye-DeepLearning/pull/488)) Better handling of missing seriesId in segmentation cross validation reports.
 - ([#454](https://github.com/microsoft/InnerEye-DeepLearning/pull/454)) Checking that labels are mutually exclusive.
 - ([#447](https://github.com/microsoft/InnerEye-DeepLearning/pull/447/)) Added a sanity check to ensure there are no
   missing channels, nor missing files. If missing channels in the csv file or filenames associated with channels are
diff --git a/InnerEye/ML/visualizers/plot_cross_validation.py b/InnerEye/ML/visualizers/plot_cross_validation.py
index 320aa2817..dc2e35b61 100644
--- a/InnerEye/ML/visualizers/plot_cross_validation.py
+++ b/InnerEye/ML/visualizers/plot_cross_validation.py
@@ -552,6 +552,9 @@ def convert_rows_for_comparisons(split_column_value: Optional[str],
     :return: augmented subset of the rows in df, as described
     """
     pre_len = len(df)
+    # If the series id column is not present, add a default value
+    if CSV_SERIES_HEADER not in dataset_df.columns:
+        dataset_df[CSV_SERIES_HEADER] = ''
     # We need the institution column to compare subjects across institutions, if it is not present with add a default
     # value
     if CSV_INSTITUTION_HEADER not in dataset_df.columns:
diff --git a/Tests/ML/visualizers/test_plot_cross_validation.py b/Tests/ML/visualizers/test_plot_cross_validation.py
index 42654ba7b..6bbf3ed4e 100644
--- a/Tests/ML/visualizers/test_plot_cross_validation.py
+++ b/Tests/ML/visualizers/test_plot_cross_validation.py
@@ -3,7 +3,8 @@
 # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 # ------------------------------------------------------------------------------------------
 from pathlib import Path
-from typing import Dict, List, Optional, Set, Tuple
+import shutil
+from typing import Callable, Dict, List, Optional, Set, Tuple
 
 import pandas as pd
 import pytest
@@ -103,17 +104,72 @@ def create_file_list_for_segmentation_recovery_run(test_config_ensemble: PlotCro
                           folder="main_1570466706163110")
 
 
+def copy_run_result_files(files: List[RunResultFiles], src_prefix_path: Path,
+                          dst_prefix_path: Path, transformer: Callable) -> List[RunResultFiles]:
+    """
+    Copy dataset_csv_files from a list of RunResultFiles to a working directory, and then
+    transform them using a callback.
+
+    :param files: List of RunResultFiles to copy.
+    :param src_prefix_path: Shared prefix path for the dataset_csv_files to be removed.
+    :param dst_prefix_path: Shared prefix path to use for the copied dataset_csv_files.
+    :param transformer: Callback function to apply to the copied dataset_csv_files.
+    :return: New list of RunResultFiles pointing at the copied files.
+    """
+    file_copies = []
+    files_copied = []
+
+    for file in files:
+        if not file.dataset_csv_file:
+            dataset_csv_file: Optional[Path] = None
+        else:
+            # Replace prefix path
+            dst_dataset_csv_file = dst_prefix_path / file.dataset_csv_file.relative_to(src_prefix_path)
+            if dst_dataset_csv_file not in files_copied:
+                dst_dataset_csv_file.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy(file.dataset_csv_file, dst_dataset_csv_file)
+                files_copied.append(dst_dataset_csv_file)
+                transformer(dst_dataset_csv_file)
+            dataset_csv_file = dst_dataset_csv_file
+
+        file_copy = RunResultFiles(execution_mode=file.execution_mode,
+                                   metrics_file=file.metrics_file,
+                                   dataset_csv_file=dataset_csv_file,
+                                   run_recovery_id=file.run_recovery_id,
+                                   split_index=file.split_index)
+        file_copies.append(file_copy)
+
+    return file_copies
+
+
 @pytest.mark.after_training_ensemble_run
-def test_metrics_preparation_for_segmentation(test_config: PlotCrossValidationConfig) -> None:
+@pytest.mark.parametrize("drop_column", [None, CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER])
+def test_metrics_preparation_for_segmentation(drop_column: Optional[str],
+                                              test_config: PlotCrossValidationConfig,
+                                              test_output_dirs: OutputFolderForTests) -> None:
     """
     Test if metrics dataframes can be loaded and prepared. The files in question are checked in, but
     were downloaded from a run, ID given in DEFAULT_ENSEMBLE_RUN_RECOVERY_ID.
+    Additionally test that CSV_INSTITUTION_HEADER or CSV_SERIES_HEADER can be dropped from the dataset_csv_file.
     """
     files = create_file_list_for_segmentation_recovery_run(test_config)
+    if drop_column:
+        def drop_csv_column(path: Path) -> None:
+            """
+            Load a csv file, drop a column, and save the csv file.
+            :param path: Path to csv file.
+            """
+            df = pd.read_csv(path)
+            dropped_df = df.drop(drop_column, axis=1)
+            dropped_df.to_csv(path)
+        files = copy_run_result_files(files, full_ml_test_data_path(), test_output_dirs.root_dir, drop_csv_column)
     downloaded_metrics = load_dataframes(files, test_config)
     assert test_config.run_recovery_id
     for mode in test_config.execution_modes_to_download():
         expected_df = _get_metrics_df(test_config.run_recovery_id, mode)
+        if drop_column:
+            # If a column was dropped from the dataset_csv_file, it is filled with the default empty value on loading.
+            expected_df[drop_column] = ''
         # Drop the "mode" column, because that was added after creating the test data
         metrics = downloaded_metrics[mode]
         assert metrics is not None