This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
3 changes: 1 addition & 2 deletions .idea/vcs.xml

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -72,6 +72,8 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
- ([#615](https://github.com/microsoft/InnerEye-DeepLearning/pull/615)) Minor changes to checkpoint download from AzureML.
- ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.
- ([#633](https://github.com/microsoft/InnerEye-DeepLearning/pull/633)) Model training now writes a single recovery checkpoint, rather than multiple ones. The save frequency is controlled by `autosave_every_n_val_epochs`.
- ([#632](https://github.com/microsoft/InnerEye-DeepLearning/pull/632)) Nifti test data is no longer stored in Git LFS

### Fixed
@@ -125,6 +127,9 @@ in inference-only runs when using lightning containers.

### Deprecated

- ([#633](https://github.com/microsoft/InnerEye-DeepLearning/pull/633)) Model fields `recovery_checkpoint_save_interval` and `recovery_checkpoints_save_last_k` have been retired.
Recovery checkpoint handling is now controlled by `autosave_every_n_val_epochs`.
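  For configs that previously set the retired fields, migration is a one-line change. A minimal sketch (the field values are illustrative only, not defaults taken from this PR):

  ```python
  from InnerEye.ML.deep_learning_config import TrainerParams

  # Before (retired): recovery_checkpoint_save_interval=10, recovery_checkpoints_save_last_k=-1
  # After: a single "autosave" checkpoint, refreshed every N validation epochs
  params = TrainerParams(num_epochs=100, autosave_every_n_val_epochs=10)
  ```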


## 0.3 (2021-06-01)

49 changes: 8 additions & 41 deletions InnerEye/Azure/azure_util.py
@@ -8,12 +8,12 @@
from pathlib import Path
from typing import Generator, List, Optional, Tuple

from azureml.core import Experiment, Run, Workspace, get_run
from azureml.core import Experiment, Run, Workspace
from azureml.exceptions import UserErrorException

from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
from health_azure.utils import create_run_recovery_id
from health_azure.utils import create_run_recovery_id, get_aml_run_from_run_id

DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1
EXPERIMENT_RUN_SEPARATOR = ":"
@@ -79,40 +79,7 @@ def fetch_run(workspace: Workspace, run_recovery_id: str) -> Run:
or just the run_id
:return: The AzureML run.
"""
experiment, run = split_recovery_id(run_recovery_id)
try:
experiment_to_recover = Experiment(workspace, experiment)
except Exception as ex:
raise Exception(
f"Unable to retrieve run {run} in experiment {experiment}: {str(ex)}"
)
run_to_recover = fetch_run_for_experiment(experiment_to_recover, run)
logging.info(
"Fetched run #{} {} from experiment {}.".format(
run, run_to_recover.number, experiment
)
)
return run_to_recover


def fetch_run_for_experiment(experiment_to_recover: Experiment, run_id: str) -> Run:
"""
:param experiment_to_recover: an experiment
:param run_id: a string representing the Run ID of one of the runs of the experiment
:return: the run matching run_id_or_number; raises an exception if not found
"""
try:
return get_run(experiment=experiment_to_recover, run_id=run_id, rehydrate=True)
except Exception:
available_runs = experiment_to_recover.get_runs()
available_ids = ", ".join([run.id for run in available_runs])
raise (
Exception(
"Run {} not found for experiment: {}. Available runs are: {}".format(
run_id, experiment_to_recover.name, available_ids
)
)
)
return get_aml_run_from_run_id(aml_workspace=workspace, run_id=run_recovery_id)


def fetch_runs(experiment: Experiment, filters: List[str]) -> List[Run]:
@@ -133,9 +100,9 @@ def fetch_runs(experiment: Experiment, filters: List[str]) -> List[Run]:


def fetch_child_runs(
run: Run,
status: Optional[str] = None,
expected_number_cross_validation_splits: int = 0,
run: Run,
status: Optional[str] = None,
expected_number_cross_validation_splits: int = 0,
) -> List[Run]:
"""
Fetch child runs for the provided runs that have the provided AML status (or fetch all by default)
Expand Down Expand Up @@ -312,7 +279,7 @@ def download_run_output_file(blob_path: Path, destination: Path, run: Run) -> Pa


def download_run_outputs_by_prefix(
blobs_prefix: Path, destination: Path, run: Run
blobs_prefix: Path, destination: Path, run: Run
) -> None:
"""
Download all the blobs from the run's default output directory: DEFAULT_AML_UPLOAD_DIR ("outputs") that
@@ -354,7 +321,7 @@ def is_running_on_azure_agent() -> bool:


def get_comparison_baseline_paths(
outputs_folder: Path, blob_path: Path, run: Run, dataset_csv_file_name: str
outputs_folder: Path, blob_path: Path, run: Run, dataset_csv_file_name: str
) -> Tuple[Optional[Path], Optional[Path]]:
run_rec_id = run.id
# We usually find dataset.csv in the same directory as metrics.csv, but we sometimes
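With the delegation to hi-ml in place, callers of `fetch_run` are unchanged: the function accepts either the `experiment_name:run_id` recovery format or a bare run ID. A minimal usage sketch (the workspace setup and the recovery ID value are illustrative):

```python
from azureml.core import Workspace

from InnerEye.Azure.azure_util import fetch_run

# Assumes an AzureML config.json is present in the working directory
workspace = Workspace.from_config()
run = fetch_run(workspace, "my_experiment:my_experiment_1612345678_abcd1234")
print(run.id, run.status)
```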
14 changes: 9 additions & 5 deletions InnerEye/ML/common.py
@@ -13,11 +13,15 @@
DATASET_CSV_FILE_NAME = "dataset.csv"
CHECKPOINT_SUFFIX = ".ckpt"

RECOVERY_CHECKPOINT_FILE_NAME = "recovery"
RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX = RECOVERY_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX
# The file name for the legacy "recovery" checkpoint behaviour, which stored the most recent N checkpoints
LEGACY_RECOVERY_CHECKPOINT_FILE_NAME = "recovery"

BEST_CHECKPOINT_FILE_NAME = "best_checkpoint"
BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = BEST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX
# The file names for the new recovery checkpoint behaviour: a single fixed checkpoint that is written every N epochs.
# Lightning does not overwrite checkpoint files in place, and hence alternates between writing "autosave.ckpt"
# and "autosave-v1.ckpt".
AUTOSAVE_CHECKPOINT_FILE_NAME = "autosave"
AUTOSAVE_CHECKPOINT_CANDIDATES = [AUTOSAVE_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX,
AUTOSAVE_CHECKPOINT_FILE_NAME + "-v1" + CHECKPOINT_SUFFIX]

# This is a constant that must match a filename defined in pytorch_lightning.ModelCheckpoint, but we don't want
# to import that here.
@@ -84,4 +88,4 @@ def get_best_checkpoint_path(path: Path) -> Path:
Given the path to a checkpoint folder, returns the full path of the best checkpoint file, based on the fixed checkpoint file name.
:param path: The path to the checkpoint folder.
"""
return path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
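Because Lightning alternates between the two autosave file names, code that resumes training has to probe both candidates and pick the newer one. A minimal sketch of such a lookup (the helper name is illustrative, not part of this PR):

```python
from pathlib import Path
from typing import Optional

from InnerEye.ML.common import AUTOSAVE_CHECKPOINT_CANDIDATES

def find_recent_autosave_checkpoint(checkpoint_folder: Path) -> Optional[Path]:
    """Returns the most recently written autosave checkpoint in the folder, or None if there is none."""
    existing = [checkpoint_folder / f for f in AUTOSAVE_CHECKPOINT_CANDIDATES
                if (checkpoint_folder / f).is_file()]
    if not existing:
        return None
    # Lightning alternates between "autosave.ckpt" and "autosave-v1.ckpt", hence
    # the file with the latest modification time is the one to resume from.
    return max(existing, key=lambda p: p.stat().st_mtime)
```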
@@ -56,8 +56,6 @@ def __init__(self, **kwargs: Any) -> None:
# To mount the dataset instead of downloading in AML, pass --use_dataset_mount in the CLI
# declared in TrainerParams:
num_epochs=16,
recovery_checkpoint_save_interval=16,
recovery_checkpoints_save_last_k=-1,
# declared in WorkflowParams:
number_of_cross_validation_splits=5,
cross_validation_split_index=0,
@@ -41,8 +41,6 @@ def __init__(self, **kwargs: Any) -> None:
# To mount the dataset instead of downloading in AML, pass --use_dataset_mount in the CLI
# declared in TrainerParams:
num_epochs=200,
recovery_checkpoint_save_interval=10,
recovery_checkpoints_save_last_k=-1,
# use_mixed_precision = True,
# declared in WorkflowParams:
number_of_cross_validation_splits=5,
5 changes: 2 additions & 3 deletions InnerEye/ML/configs/segmentation/BasicModel2Epochs.py
@@ -7,7 +7,7 @@
import pandas as pd

from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.configs.segmentation.Lung import AZURE_DATASET_ID
from InnerEye.ML.configs.segmentation.Lung import LUNG_AZURE_DATASET_ID
from InnerEye.ML.deep_learning_config import LRSchedulerType
from InnerEye.ML.utils.split_dataset import DatasetSplits

@@ -40,9 +40,8 @@ def __init__(self, **kwargs: Any) -> None:
num_dataload_workers=1,
train_batch_size=8,
num_epochs=2,
recovery_checkpoint_save_interval=1,
use_mixed_precision=True,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
comparison_blob_storage_paths=comparison_blob_storage_paths,
inference_on_val_set=True,
inference_on_test_set=True,
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/GbmBase.py
@@ -50,7 +50,6 @@ def __init__(self, **kwargs: Any) -> None:
adam_betas=(0.9, 0.999),
momentum=0.9,
weight_decay=1e-4,
recovery_checkpoint_save_interval=10,
use_mixed_precision=True,
use_model_parallel=True,
)
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/HeadAndNeckBase.py
@@ -86,7 +86,6 @@ def __init__(self,
super().__init__(
should_validate=False, # we'll validate after kwargs are added
num_epochs=num_epochs,
recovery_checkpoint_save_interval=10,
architecture="UNet3D",
kernel_size=3,
train_batch_size=1,
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/HelloWorld.py
@@ -59,7 +59,6 @@ def __init__(self, **kwargs: Any) -> None:
num_dataload_workers=0,
train_batch_size=2,
num_epochs=2,
recovery_checkpoint_save_interval=1,
use_mixed_precision=True,

# Pre-processing - in this section we define how to normalize our inputs, in this case we are doing
5 changes: 2 additions & 3 deletions InnerEye/ML/configs/segmentation/Lung.py
@@ -13,7 +13,7 @@
from InnerEye.ML.utils.split_dataset import DatasetSplits

# Change this string to the name of your dataset on Azure blob storage.
AZURE_DATASET_ID = "2339eba2-8ec5-4ccb-86ff-c170470ac6e2_geonorm_with_train_test_split_2020_05_26"
LUNG_AZURE_DATASET_ID = "2339eba2-8ec5-4ccb-86ff-c170470ac6e2_geonorm_with_train_test_split_2020_05_26"


class Lung(SegmentationModelBase):
@@ -29,7 +29,7 @@ def __init__(self, **kwargs: Any) -> None:
architecture="UNet3D",
feature_channels=[32],
kernel_size=3,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
crop_size=(64, 224, 224),
test_crop_size=(128, 512, 512),
image_channels=["ct"],
@@ -56,7 +56,6 @@ def __init__(self, **kwargs: Any) -> None:
adam_betas=(0.9, 0.999),
momentum=0.9,
weight_decay=1e-4,
recovery_checkpoint_save_interval=10,
use_mixed_precision=True,
use_model_parallel=True,
monitoring_interval_seconds=0,
3 changes: 0 additions & 3 deletions InnerEye/ML/configs/ssl/CIFAR_SSL_configs.py
@@ -20,7 +20,6 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.SimCLR,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)

@@ -38,7 +37,6 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.BYOL,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)

@@ -55,6 +53,5 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.BYOL,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)
1 change: 0 additions & 1 deletion InnerEye/ML/configs/ssl/CIFAR_classifier_configs.py
@@ -11,7 +11,6 @@ def __init__(self) -> None:
super().__init__(
linear_head_dataset_name=SSLDatasetName.CIFAR10,
random_seed=1,
recovery_checkpoint_save_interval=5,
num_epochs=100,
l_rate=1e-4,
num_workers=6)
3 changes: 0 additions & 3 deletions InnerEye/ML/configs/ssl/CXR_SSL_configs.py
@@ -27,7 +27,6 @@ def __init__(self) -> None:
linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
azure_dataset_id=NIH_AZURE_DATASET_ID,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=1000,
# We usually train this model with 16 GPUs, giving an effective batch size of 1200
ssl_training_batch_size=75,
@@ -44,7 +43,6 @@ def __init__(self) -> None:
linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
azure_dataset_id=NIH_AZURE_DATASET_ID,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=1000,
# We usually train this model with 16 GPUs, giving an effective batch size of 1200
ssl_training_batch_size=75,
@@ -60,7 +58,6 @@ class CXRImageClassifier(SSLClassifierContainer):
def __init__(self) -> None:
super().__init__(linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
random_seed=1,
recovery_checkpoint_save_interval=10,
num_epochs=200,
use_balanced_binary_loss_for_linear_head=True,
azure_dataset_id=RSNA_AZURE_DATASET_ID,
2 changes: 0 additions & 2 deletions InnerEye/ML/configs/ssl/CovidContainers.py
@@ -20,8 +20,6 @@ def __init__(self,
super().__init__(ssl_training_dataset_name=SSLDatasetName.NIHCXR,
linear_head_dataset_name=SSLDatasetName.Covid,
random_seed=1,
recovery_checkpoint_save_interval=50,
recovery_checkpoints_save_last_k=3,
num_epochs=500,
ssl_training_batch_size=75, # This runs with 16 gpus (4 nodes)
num_workers=12,
4 changes: 2 additions & 2 deletions InnerEye/ML/configs/unit_testing/passthrough_model.py
@@ -12,7 +12,7 @@

from InnerEye.Common.type_annotations import TupleInt3
from InnerEye.ML.config import ModelArchitectureConfig, SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.configs.segmentation.Lung import AZURE_DATASET_ID
from InnerEye.ML.configs.segmentation.Lung import LUNG_AZURE_DATASET_ID
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
from InnerEye.ML.models.parallel.model_parallel import get_device_from_parameters, move_to_device
from InnerEye.ML.utils.model_metadata_util import generate_random_colours_list
@@ -34,7 +34,7 @@ def __init__(self, **kwargs: Any) -> None:
should_validate=False,
# Set as UNet3D only because this does not shrink patches in the forward pass.
architecture=ModelArchitectureConfig.UNet3D,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
crop_size=(64, 224, 224),
num_dataload_workers=1,
# Disable monitoring so that we can use VS Code remote debugging
15 changes: 5 additions & 10 deletions InnerEye/ML/deep_learning_config.py
@@ -475,7 +475,6 @@ def logs_folder(self) -> Path:
@property
def checkpoint_folder(self) -> Path:
"""Gets the full path in which the model checkpoints should be stored during training."""
print(f"Expected Checkpoint path {self.outputs_folder / CHECKPOINT_FOLDER}")
return self.outputs_folder / CHECKPOINT_FOLDER

@property
@@ -567,15 +566,11 @@ def min_l_rate(self, value: float) -> None:

class TrainerParams(param.Parameterized):
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
recovery_checkpoint_save_interval: int = param.Integer(10, bounds=(0, None),
doc="Save epoch checkpoints when epoch number is a multiple "
"of recovery_checkpoint_save_interval. The intended use "
"is to allow restore training from failed runs.")
recovery_checkpoints_save_last_k: int = param.Integer(default=1, bounds=(-1, None),
doc="Number of recovery checkpoints to keep. Recovery "
"checkpoints will be stored as recovery_epoch:{"
"epoch}.ckpt. If set to -1 keep all recovery "
"checkpoints.")
autosave_every_n_val_epochs: int = param.Integer(1, bounds=(0, None),
doc="Save epoch checkpoints every N validation epochs. "
"If pl_check_val_every_n_epoch > 1, this means that "
"checkpoints are saved every N * pl_check_val_every_n_epoch "
"training epochs.")
detect_anomaly: bool = param.Boolean(False, doc="If true, test gradients for anomalies (NaN or Inf) during "
"training.")
use_mixed_precision: bool = param.Boolean(False, doc="If true, mixed precision training is activated during "
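As the doc string notes, the effective save frequency in training epochs is the product of the two settings. A worked sketch of the arithmetic (the `pl_check_val_every_n_epoch` value is assumed for illustration):

```python
from InnerEye.ML.deep_learning_config import TrainerParams

params = TrainerParams(num_epochs=100, autosave_every_n_val_epochs=3)

# With validation running every 2 training epochs (pl_check_val_every_n_epoch=2),
# the autosave checkpoint is refreshed every 3 validation epochs,
# i.e. every 2 * 3 = 6 training epochs.
pl_check_val_every_n_epoch = 2  # assumed value, for illustration only
print(pl_check_val_every_n_epoch * params.autosave_every_n_val_epochs)  # -> 6
```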