This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Closed

Commits (54)
a708061: first version (ant0nsc, Sep 16, 2021)
6c65c6a: Merge remote-tracking branch 'origin/main' into antonsc/recovery (ant0nsc, Sep 17, 2021)
d1b86f1: docu update (ant0nsc, Sep 20, 2021)
be27a1b: docu update (ant0nsc, Sep 20, 2021)
b4b9d74: tests working (ant0nsc, Sep 20, 2021)
8927c20: unit test (ant0nsc, Sep 21, 2021)
a291d09: cleanup (ant0nsc, Sep 21, 2021)
ca8bd69: tests for nih (ant0nsc, Sep 21, 2021)
608a404: adding more tests (ant0nsc, Sep 21, 2021)
a6da745: index error (ant0nsc, Sep 21, 2021)
b2f96f1: diagnostics (ant0nsc, Sep 22, 2021)
22d3f08: rank_zero for epoch (ant0nsc, Sep 22, 2021)
0b0621b: sync_dist for loss (ant0nsc, Sep 22, 2021)
eb266d8: diagnostics (ant0nsc, Sep 22, 2021)
bb36f59: Merge remote-tracking branch 'origin/main' into antonsc/recovery (ant0nsc, Sep 24, 2021)
79e3dbe: disabling logger (ant0nsc, Sep 24, 2021)
1e33c5c: remove storing logger (ant0nsc, Sep 27, 2021)
cfa0dd6: diagnostics in AML logger (ant0nsc, Sep 27, 2021)
7fb62ad: change logged "epoch" (ant0nsc, Sep 27, 2021)
c1ff09e: improved logger (ant0nsc, Sep 27, 2021)
2cdcd47: byol on_step=True (ant0nsc, Sep 27, 2021)
aa7e306: flags for speedup (ant0nsc, Sep 27, 2021)
8bd7b46: cleaned up logging (ant0nsc, Sep 27, 2021)
1375753: more logging cleanup (ant0nsc, Sep 27, 2021)
e68e469: remove diagnostics (ant0nsc, Sep 27, 2021)
a0bc050: flake and mypy (ant0nsc, Sep 27, 2021)
0adae84: fix logging int problem (ant0nsc, Sep 27, 2021)
a5e5e6b: fix logging issue (ant0nsc, Sep 29, 2021)
7d11086: improve logging (ant0nsc, Sep 29, 2021)
77c2ab2: log_on_epoch function (ant0nsc, Sep 30, 2021)
1ff69e6: tests passing (ant0nsc, Sep 30, 2021)
c3272f4: tests (ant0nsc, Sep 30, 2021)
4ece208: checkpoint fix (ant0nsc, Sep 30, 2021)
db61fc0: typo (ant0nsc, Oct 1, 2021)
40fe258: logging learning rates (ant0nsc, Oct 1, 2021)
e03f0a9: sync across GPU in linear layer (ant0nsc, Oct 1, 2021)
0ed3214: Revert "sync across GPU in linear layer" (ant0nsc, Oct 4, 2021)
3d5e178: helpers to log learning rates (ant0nsc, Oct 4, 2021)
e8cd6a1: find unused (ant0nsc, Oct 4, 2021)
13b7884: tests for logging LR (ant0nsc, Oct 4, 2021)
86ae2e4: change semantics of batch size (ant0nsc, Oct 5, 2021)
92cfccb: manual optimization with single GPU loss function (ant0nsc, Oct 5, 2021)
00443af: fixing LR update for manual optimization (ant0nsc, Oct 5, 2021)
8ae2746: remove single GPU loss (ant0nsc, Oct 6, 2021)
aca6a26: changelog (ant0nsc, Oct 6, 2021)
a4ed77d: cleanup callback (ant0nsc, Oct 7, 2021)
ced9fd1: test fixes (ant0nsc, Oct 7, 2021)
1f3893c: flake (ant0nsc, Oct 7, 2021)
c7b6e11: mypy (ant0nsc, Oct 7, 2021)
56ca8da: test fixes (ant0nsc, Oct 8, 2021)
0d23c83: fix batch sizes (ant0nsc, Oct 8, 2021)
3a7e9b9: reduce tolerance (ant0nsc, Oct 11, 2021)
ec75319: batch size back to 75 (ant0nsc, Oct 11, 2021)
3fe959b: sync_dist=False (ant0nsc, Oct 11, 2021)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ created.
## Upcoming

### Added
- ([#565](https://github.com/microsoft/InnerEye-DeepLearning/pull/565)) All `LightningContainer` models have two new command-line flags, `pl_limit_train_batches` and `pl_limit_val_batches`, to set the number of batches per epoch. Use these to speed up training, for example when debugging (see the usage sketch after this list).
- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Adding ability to run segmentation inference
module on test data with partial ground truth files. (Also [522](https://github.com/microsoft/InnerEye-DeepLearning/pull/522).)
- ([#502](https://github.com/microsoft/InnerEye-DeepLearning/pull/502)) More flags for fine control of when to run inference.
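As an aside for readers of this changelog entry: the flag names above come from this PR, but everything else in the sketch below is an assumption rather than code from the repository. It illustrates how the new flags presumably map onto PyTorch Lightning's `Trainer` arguments of the same purpose.

```python
# Minimal sketch, assuming pl_limit_train_batches / pl_limit_val_batches are forwarded
# to the PyTorch Lightning Trainer's limit_train_batches / limit_val_batches arguments.
from pytorch_lightning import Trainer

pl_limit_train_batches = 2  # run only 2 training batches per epoch (useful when debugging)
pl_limit_val_batches = 2    # run only 2 validation batches per epoch
trainer = Trainer(limit_train_batches=pl_limit_train_batches,
                  limit_val_batches=pl_limit_val_batches)
```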
@@ -44,6 +45,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
in the config to run inference on both the validation and test sets by default.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#565](https://github.com/microsoft/InnerEye-DeepLearning/pull/565)) The semantics of the SSL parameter `ssl_training_batch_size` changed from "effective batch size" (across all GPUs) to "batch size per GPU".
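A worked example of the new semantics (illustrative variable names only, matching the example given in the updated `ssl_training_batch_size` docstring later in this PR):

```python
# Illustration of the new "batch size per GPU" semantics.
ssl_training_batch_size = 100            # batch size on each individual GPU
num_nodes, gpus_per_node = 4, 4
total_num_gpus = num_nodes * gpus_per_node
effective_batch_size = ssl_training_batch_size * total_num_gpus
print(effective_batch_size)              # 1600 samples contribute to each optimizer step
```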

### Fixed
- ([#537](https://github.com/microsoft/InnerEye-DeepLearning/pull/537)) Print warning if inference is disabled but comparison requested.
@@ -69,6 +71,7 @@ in inference-only runs when using lightning containers.
correctly in the SimCLR module
- ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model
weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs.
- ([#565](https://github.com/microsoft/InnerEye-DeepLearning/pull/565)) Checkpoints from SSL training now contain both optimizers, hence restarts after low priority preemption will correctly continue training of the linear head.

### Removed

@@ -2,7 +2,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
@ozan-oktay (Contributor) commented on Oct 6, 2021:

I think it would be good to attach ssl training run results for future reference - before and after this manual optimisation change. (both for SimCLR and BYOL)

# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

from typing import Any

import torch
6 changes: 3 additions & 3 deletions InnerEye/Common/Statistics/report_structure_extremes.py
@@ -140,7 +140,7 @@ def report_structure_extremes_for_subject(subj_dir: str, series_id: str) -> Iter
yield line_for_structure(subject, series_prefix, base, data)


def line_for_structure(subject: str, series_prefix: str, base: str, data: np.array) -> str:
def line_for_structure(subject: str, series_prefix: str, base: str, data: np.ndarray) -> str:
"""
:param subject: a subject, to include in the result
:param series_prefix: first 8 characters (if any) of the series ID of the subject
@@ -169,7 +169,7 @@ def line_for_structure(subject: str, series_prefix: str, base: str, data: np.arr
return line


def extent_list(presence: np.array, max_value: int) -> Tuple[List[int], List[str]]:
def extent_list(presence: np.ndarray, max_value: int) -> Tuple[List[int], List[str]]:
"""
:param presence: a 1-D array of distinct integers in increasing order.
:param max_value: any integer, not necessarily related to presence
@@ -186,7 +186,7 @@ def extent_list(presence: np.array, max_value: int) -> Tuple[List[int], List[str
return result, missing_ranges


def derive_missing_ranges(presence: np.array) -> List[str]:
def derive_missing_ranges(presence: np.ndarray) -> List[str]:
"""
:param presence: a 1-D array of distinct integers in increasing order.
:return: a list of strings, each denoting a missing range of values within "presence".
3 changes: 2 additions & 1 deletion InnerEye/Common/type_annotations.py
@@ -3,7 +3,7 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple, TypeVar, Union
from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union

T = TypeVar('T')
PathOrString = Union[Path, str]
@@ -15,3 +15,4 @@
TupleFloat9 = Tuple[float, float, float, float, float, float, float, float, float]
IntOrTuple3 = Union[int, TupleInt3, Iterable]
DictStrFloat = Dict[str, float]
DictStrFloatOrFloatList = Dict[str, Union[float, List[float]]]
3 changes: 3 additions & 0 deletions InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py
@@ -136,6 +136,9 @@ def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[SSLDataModuleType,
"""
The train dataloaders
"""
# This code may be superseded in current versions of PL. Using this dictionary syntax will effectively
# use a CombinedLoader(dataloaders, mode="max_size_cycle"), similar to what we need to do explicitly for
# the validation data loader.
dataloaders = {
SSLDataModuleType.ENCODER: self.encoder_module.train_dataloader(),
SSLDataModuleType.LINEAR_HEAD: self.linear_head_module.train_dataloader()}
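For context on the comment above: returning this dictionary makes Lightning wrap the two loaders in a `CombinedLoader`. A minimal sketch of the equivalent explicit construction (the form the comment says is needed for the validation loader), assuming the PyTorch Lightning 1.x import path:

```python
from pytorch_lightning.trainer.supporters import CombinedLoader

# Explicit equivalent of returning the dictionary above: the shorter loader is cycled
# so that every combined batch contains both an encoder sample and a linear-head sample.
combined_loader = CombinedLoader(dataloaders, mode="max_size_cycle")
```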
41 changes: 22 additions & 19 deletions InnerEye/ML/SSL/lightning_containers/ssl_container.py
@@ -75,9 +75,9 @@ class SSLContainer(LightningContainer):
"augmentations. Ignored for CIFAR10 example")
ssl_training_dataset_name = param.ClassSelector(class_=SSLDatasetName, doc="The name of the dataset")
ssl_training_batch_size = param.Integer(
doc="Total training batch size, will be divided across the number of gpus used for training. For example: if "
"you specify ssl_training_batch_size=1600 and use 4 nodes with 4 gpus each (i.e. total of 16 GPUs), "
"the code will provide a per-gpu batch size of 100")
doc="Training batch size per GPU. The effective batch size will be the number of GPUs times this number. "
"For example, if you specify ssl_training_batch_size=100 and use 4 nodes with 4 gpus each, "
"the effective batch size will be 1600.")
ssl_training_type = param.ClassSelector(class_=SSLTrainingType, doc="Which algorithm to use for SSL training")
ssl_encoder = param.ClassSelector(class_=EncoderName, doc="Which encoder to use for SSL")
use_balanced_binary_loss_for_linear_head = param.Boolean(default=False,
@@ -100,6 +100,9 @@ class SSLContainer(LightningContainer):

def setup(self) -> None:
from InnerEye.ML.SSL.lightning_containers.ssl_image_classifier import SSLClassifierContainer
if self.is_debug_model:
self.pl_limit_train_batches = 1
self.pl_limit_val_batches = 1
self.total_num_gpus = self.num_gpus_per_node * self.num_nodes
self._load_config()
# If you're using the same data for training and linear head, allow the user to specify the dataset only
@@ -169,6 +172,13 @@ def create_model(self) -> LightningModule:
"num_classes": self.data_module.num_classes})

self.encoder_output_dim = get_encoder_output_dim(model, self.data_module)
self.online_eval_callback = \
SSLOnlineEvaluatorInnerEye(class_weights=self.data_module.class_weights, # type: ignore
z_dim=self.encoder_output_dim,
num_classes=self.data_module.num_classes, # type: ignore
dataset=self.linear_head_dataset_name.value, # type: ignore
drop_p=0.2,
learning_rate=self.learning_rate_linear_head_during_ssl_training)
return model

def get_data_module(self) -> InnerEyeDataModuleTypes:
@@ -199,16 +209,17 @@ def _create_ssl_data_modules(self, is_ssl_encoder_module: bool) -> InnerEyeVisio
train_transforms, val_transforms = self._get_transforms(datamodule_args.augmentation_params,
datamodule_args.dataset_name,
is_ssl_encoder_module)
batch_size_per_gpu = datamodule_args.batch_size // self.total_num_gpus if self.total_num_gpus > 0 else \
datamodule_args.batch_size
logging.info(f"Batch size per gpu: {batch_size_per_gpu}")
batch_multiplier = self.total_num_gpus if self.total_num_gpus > 0 else 1
effective_batch_size = datamodule_args.batch_size * batch_multiplier
logging.info(f"Batch size per GPU: {datamodule_args.batch_size}")
logging.info(f"Effective batch size on {batch_multiplier} GPUs: {effective_batch_size}")
dm = InnerEyeVisionDataModule(dataset_cls=self._SSLDataClassMappings[datamodule_args.dataset_name],
return_index=not is_ssl_encoder_module, # index is only needed for linear head
train_transforms=train_transforms,
val_split=0.1,
val_transforms=val_transforms,
data_dir=str(datamodule_args.dataset_path),
batch_size=batch_size_per_gpu,
batch_size=datamodule_args.batch_size,
num_workers=self.num_workers,
seed=self.random_seed,
drop_last=self.drop_last)
@@ -226,9 +237,9 @@ def _get_transforms(self, augmentation_config: Optional[CfgNode],
:param dataset_name: name of the dataset, value has to be in SSLDatasetName, determines which transformation
pipeline to return.
:param is_ssl_encoder_module: if True the transformation pipeline will yield two versions of the image it is
applied on and it applies the training transformations also at validation time. Note that if your transformation
does not contain any randomness, the pipeline will return two identical copies. If False, it will return only one
transformation.
applied on and it applies the training transformations also at validation time. Note that if your transformation
does not contain any randomness, the pipeline will return two identical copies. If False, it will return only
one transformation.
:return: training transformation pipeline and validation transformation pipeline.
"""
if dataset_name in [SSLDatasetName.RSNAKaggleCXR.value,
@@ -262,13 +273,5 @@ def _get_transforms(self, augmentation_config: Optional[CfgNode],
return train_transforms, val_transforms

def get_trainer_arguments(self) -> Dict[str, Any]:
self.online_eval = SSLOnlineEvaluatorInnerEye(class_weights=self.data_module.class_weights, # type: ignore
z_dim=self.encoder_output_dim,
num_classes=self.data_module.num_classes, # type: ignore
dataset=self.linear_head_dataset_name.value, # type: ignore
drop_p=0.2,
learning_rate=self.learning_rate_linear_head_during_ssl_training)
trainer_kwargs: Dict[str, Any] = {"callbacks": self.online_eval}
if self.is_debug_model:
trainer_kwargs.update({"limit_train_batches": 1, "limit_val_batches": 1})
trainer_kwargs: Dict[str, Any] = {"callbacks": self.online_eval_callback}
return trainer_kwargs
5 changes: 1 addition & 4 deletions InnerEye/ML/SSL/lightning_containers/ssl_image_classifier.py
@@ -64,7 +64,4 @@ def get_data_module(self) -> InnerEyeDataModuleTypes:
return self.data_module

def get_trainer_arguments(self) -> Dict[str, Any]:
trained_kwargs = {}
if self.is_debug_model:
trained_kwargs.update({"limit_train_batches": 1, "limit_val_batches": 1})
return trained_kwargs
return {}
16 changes: 9 additions & 7 deletions InnerEye/ML/SSL/lightning_modules/byol/byol_module.py
@@ -10,13 +10,14 @@
import torch
import torch.nn.functional as F
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from pytorch_lightning import Trainer
from torch import Tensor as T
from torch.optim import Adam

from InnerEye.ML.SSL.lightning_modules.byol.byol_models import SiameseArm
from InnerEye.ML.SSL.lightning_modules.byol.byol_moving_average import ByolMovingAverageWeightUpdate
from InnerEye.ML.SSL.utils import SSLDataModuleType
from pytorch_lightning import Trainer
from InnerEye.ML.lightning_loggers import log_learning_rate, log_on_epoch

SingleBatchType = Tuple[List, T]
BatchType = Union[Dict[SSLDataModuleType, SingleBatchType], SingleBatchType]
@@ -98,14 +99,15 @@ def shared_step(self, batch: BatchType, batch_idx: int) -> T:

return loss

def training_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> T: # type: ignore
def training_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> torch.Tensor: # type: ignore
loss = self.shared_step(batch, batch_idx)
self.log_dict({'byol/train/loss': loss, 'byol/tau': self.weight_callback.current_tau})
log_on_epoch(self, metrics={'byol/train/loss': loss, 'byol/tau': self.weight_callback.current_tau})
log_learning_rate(self, name="byol/learning_rate")
return loss

def validation_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> T: # type: ignore
loss = self.shared_step(batch, batch_idx)
self.log_dict({'byol/val/loss': loss})
log_on_epoch(self, 'byol/val/loss', loss)
return loss
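The `log_on_epoch` and `log_learning_rate` helpers used in the two steps above live in the new `InnerEye.ML.lightning_loggers` module, which is not part of this diff. The following is only a rough sketch of what such helpers could look like, written to make the call sites readable; the actual implementation in the PR may differ.

```python
from typing import Any, Dict, Optional

from pytorch_lightning import LightningModule


def log_on_epoch(module: LightningModule,
                 name: Optional[str] = None,
                 value: Optional[Any] = None,
                 metrics: Optional[Dict[str, Any]] = None,
                 sync_dist: bool = True) -> None:
    """Log a single metric or a dictionary of metrics, aggregated per epoch rather than per step."""
    metrics = metrics if metrics is not None else {name: value}
    module.log_dict(metrics, on_epoch=True, on_step=False, sync_dist=sync_dist)


def log_learning_rate(module: LightningModule, name: str = "learning_rate") -> None:
    """Read the current learning rate(s) from the module's optimizer(s) and log them per epoch."""
    lrs = [group["lr"] for optimizer in module.trainer.optimizers for group in optimizer.param_groups]
    for index, lr in enumerate(lrs):
        suffix = "" if len(lrs) == 1 else f"/{index}"
        log_on_epoch(module, name=name + suffix, value=lr)
```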

def setup(self, *args: Any, **kwargs: Any) -> None:
@@ -116,9 +118,10 @@ def configure_optimizers(self) -> Any:
# exclude certain parameters
parameters = self.exclude_from_wt_decay(self.online_network.named_parameters(),
weight_decay=self.hparams.weight_decay) # type: ignore
optimizer = Adam(parameters, lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) # type: ignore
optimizer = Adam(parameters, lr=self.hparams.learning_rate, # type: ignore
weight_decay=self.hparams.weight_decay) # type: ignore
scheduler = LinearWarmupCosineAnnealingLR(
optimizer, warmup_epochs=self.hparams.warmup_epochs, max_epochs=self.hparams.max_epochs) # type: ignore
optimizer, warmup_epochs=self.hparams.warmup_epochs, max_epochs=self.hparams.max_epochs) # type: ignore
return [optimizer], [scheduler]

def exclude_from_wt_decay(self,
@@ -144,4 +147,3 @@ def exclude_from_wt_decay(self,
{'params': params, 'weight_decay': weight_decay},
{'params': excluded_params, 'weight_decay': 0.}
]

16 changes: 12 additions & 4 deletions InnerEye/ML/SSL/lightning_modules/simclr_module.py
@@ -2,7 +2,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

from typing import Any, Dict, List, Tuple, Union

import torch
@@ -13,6 +12,7 @@

from InnerEye.ML.SSL.encoders import SSLEncoder
from InnerEye.ML.SSL.utils import SSLDataModuleType
from InnerEye.ML.lightning_loggers import log_learning_rate, log_on_epoch

SingleBatchType = Tuple[List, T]
BatchType = Union[Dict[SSLDataModuleType, SingleBatchType], SingleBatchType]
@@ -57,6 +57,17 @@ def __init__(self, encoder_name: str, dataset_name: str, use_7x7_first_conv_in_r
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.encoder(x)

def training_step(self, batch: BatchType, batch_idx: int) -> torch.Tensor:
loss = self.shared_step(batch)
log_on_epoch(self, "simclr/train/loss", loss, sync_dist=False)
log_learning_rate(self, name="simclr/learning_rate")
return loss

def validation_step(self, batch: BatchType, batch_idx: int) -> T: # type: ignore
loss = self.shared_step(batch)
log_on_epoch(self, "simclr/val/loss", loss, sync_dist=False)
return loss

def shared_step(self, batch: BatchType) -> T:
batch = batch[SSLDataModuleType.ENCODER] if isinstance(batch, dict) else batch

@@ -72,6 +83,3 @@ def shared_step(self, batch: BatchType) -> T:
loss = self.nt_xent_loss(z1, z2, self.temperature)

return loss



11 changes: 6 additions & 5 deletions InnerEye/ML/SSL/lightning_modules/ssl_classifier_module.py
@@ -5,11 +5,12 @@
from typing import Any, List, Optional

import torch
from torchmetrics import Metric
from pl_bolts.models.self_supervised import SSLEvaluator
from torch.nn import functional as F
from torchmetrics import Metric

from InnerEye.ML.SSL.encoders import get_encoder_output_dim
from InnerEye.ML.lightning_loggers import log_on_epoch
from InnerEye.ML.dataset.scalar_sample import ScalarItem
from InnerEye.ML.lightning_container import LightningModuleWithOptimizer
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderPrecisionRecallCurve, AreaUnderRocCurve
@@ -79,16 +80,16 @@ def shared_step(self, batch: Any, is_training: bool) -> Any:

def training_step(self, batch: Any, batch_id: int, *args: Any, **kwargs: Any) -> Any: # type: ignore
loss = self.shared_step(batch, True)
self.log("train/loss", loss, on_step=False, on_epoch=True)
log_on_epoch(self, "train/loss", loss)
for metric in self.train_metrics:
self.log(f"train/{metric.name}", metric, on_epoch=True, on_step=False)
log_on_epoch(self, f"train/{metric.name}", metric)
return loss

def validation_step(self, batch: Any, batch_id: int, *args: Any, **kwargs: Any) -> None: # type: ignore
loss = self.shared_step(batch, is_training=False)
self.log('val/loss', loss, on_step=False, on_epoch=True, sync_dist=True)
log_on_epoch(self, 'val/loss', loss)
for metric in self.val_metrics:
self.log(f"val/{metric.name}", metric, on_epoch=True, on_step=False)
log_on_epoch(self, f"val/{metric.name}", metric)

def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
"""