This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
48 commits
cf51128
instructions
ant0nsc Sep 22, 2021
b6201b5
Moving loading time out into a callback
ant0nsc Oct 13, 2021
7b5997a
fixing timing callback
ant0nsc Oct 13, 2021
9d55e20
docu
ant0nsc Oct 13, 2021
4254ff2
docu
ant0nsc Oct 13, 2021
0a1fc26
progress bar
ant0nsc Oct 14, 2021
f190669
docu and cleanup
ant0nsc Oct 14, 2021
f90912d
tests
ant0nsc Oct 15, 2021
9bdbd7a
test cleanup
ant0nsc Oct 18, 2021
d02ba0b
test for timers
ant0nsc Oct 19, 2021
8cbc5f1
cleanup
ant0nsc Oct 19, 2021
144698a
tests for callback
ant0nsc Oct 19, 2021
6674b70
hyperparams logging
ant0nsc Oct 19, 2021
ebf8b25
flags
ant0nsc Oct 19, 2021
fd96667
Merge branch 'antonsc/submodule_doc' into antonsc/diagnostics
ant0nsc Oct 19, 2021
b11f6dd
submodule
ant0nsc Oct 19, 2021
8ac90f2
update all usage
ant0nsc Oct 19, 2021
547626e
fix
ant0nsc Oct 20, 2021
986fba8
cleanup
ant0nsc Oct 20, 2021
a823af0
callback save and load
ant0nsc Nov 2, 2021
17564f1
find_unused
ant0nsc Nov 2, 2021
367662f
Merge remote-tracking branch 'origin/main' into antonsc/diagnostics
ant0nsc Nov 2, 2021
6064bc5
remove submodule
ant0nsc Nov 2, 2021
88ed46c
storinglogger update
ant0nsc Nov 2, 2021
857026c
head_batchsize
ant0nsc Nov 2, 2021
a71a476
using submodule
ant0nsc Nov 2, 2021
69fe247
import fix
ant0nsc Nov 2, 2021
3ef5d5d
log_on_epoch
ant0nsc Nov 3, 2021
40b6d07
cleanup of metrics
ant0nsc Nov 3, 2021
8080544
changelog
ant0nsc Nov 3, 2021
8cbafe7
removing submodule
ant0nsc Nov 3, 2021
70771d3
fix import
ant0nsc Nov 3, 2021
c964d84
changelog
ant0nsc Nov 3, 2021
a432fe2
flake fix
ant0nsc Nov 3, 2021
4ee9e8e
Merge remote-tracking branch 'origin/antonsc/diagnostics' into antons…
ant0nsc Nov 3, 2021
d605335
fixed logging
ant0nsc Nov 3, 2021
e086757
Merge remote-tracking branch 'origin/main' into antonsc/recovery2
ant0nsc Nov 12, 2021
7f75ec3
changelog
ant0nsc Nov 12, 2021
c22362e
mypy
ant0nsc Nov 12, 2021
2da000b
test fix
ant0nsc Nov 13, 2021
16f9793
Merge branch 'main' into antonsc/recovery2
ant0nsc Nov 17, 2021
588dd01
PR comments
ant0nsc Nov 17, 2021
44b8d0e
fix
ant0nsc Nov 17, 2021
ea77b47
fix
ant0nsc Nov 17, 2021
a14b301
Merge remote-tracking branch 'origin/antonsc/pathfix' into antonsc/re…
ant0nsc Nov 18, 2021
3123ed4
Merge remote-tracking branch 'origin/main' into antonsc/recovery2
ant0nsc Nov 18, 2021
5703c1c
fix
ant0nsc Nov 18, 2021
4c4f4eb
PR comments
ant0nsc Nov 18, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -51,6 +51,7 @@ gets uploaded to AzureML, by skipping all test folders.
`ScalarModelBase`.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
in the config to run inference on both the validation and test sets by default.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package

3 changes: 3 additions & 0 deletions InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py
@@ -136,6 +136,9 @@ def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[SSLDataModuleType,
"""
The train dataloaders
"""
# This code may be superseded in current versions of PL. Using this dictionary syntax will effectively
# use a CombinedLoader(dataloaders, mode="max_size_cycle"), similar to what we need to do explicitly for
# the validation data loader.
dataloaders = {
SSLDataModuleType.ENCODER: self.encoder_module.train_dataloader(),
SSLDataModuleType.LINEAR_HEAD: self.linear_head_module.train_dataloader()}
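Note on the comment above: returning a dictionary of loaders is roughly equivalent to wrapping them in PyTorch Lightning's CombinedLoader. A hedged sketch of that equivalence, assuming a PL 1.x install where CombinedLoader lives in pytorch_lightning.trainer.supporters (the keys and dataset shapes below are made up for illustration):

```python
import torch
from pytorch_lightning.trainer.supporters import CombinedLoader
from torch.utils.data import DataLoader, TensorDataset

# Two loaders of different length, standing in for the encoder and linear-head modules.
encoder_loader = DataLoader(TensorDataset(torch.randn(100, 3, 32, 32)), batch_size=10)
linear_head_loader = DataLoader(TensorDataset(torch.randn(40, 3, 32, 32)), batch_size=10)

dataloaders = {"encoder": encoder_loader, "linear_head": linear_head_loader}
# "max_size_cycle" cycles the shorter loader so that one pass covers the longest loader.
combined = CombinedLoader(dataloaders, mode="max_size_cycle")

for batch in combined:
    # Each batch is a dict with one entry per loader, mirroring the dict returned above.
    print({key: value[0].shape for key, value in batch.items()})
    break
```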
29 changes: 16 additions & 13 deletions InnerEye/ML/SSL/lightning_containers/ssl_container.py
@@ -75,9 +75,9 @@ class SSLContainer(LightningContainer):
"augmentations. Ignored for CIFAR10 example")
ssl_training_dataset_name = param.ClassSelector(class_=SSLDatasetName, doc="The name of the dataset")
ssl_training_batch_size = param.Integer(
doc="Total training batch size, will be divided across the number of gpus used for training. For example: if "
"you specify ssl_training_batch_size=1600 and use 4 nodes with 4 gpus each (i.e. total of 16 GPUs), "
"the code will provide a per-gpu batch size of 100")
doc="Training batch size per GPU. The effective batch size will be the number of GPUs times this number. "
"For example, if you specify ssl_training_batch_size=100 and use 4 nodes with 4 gpus each, "
"the effective batch size will be 1600.")
ssl_training_type = param.ClassSelector(class_=SSLTrainingType, doc="Which algorithm to use for SSL training")
ssl_encoder = param.ClassSelector(class_=EncoderName, doc="Which encoder to use for SSL")
use_balanced_binary_loss_for_linear_head = param.Boolean(default=False,
@@ -92,14 +92,18 @@ class SSLContainer(LightningContainer):
"augmentations")
linear_head_dataset_name = param.ClassSelector(class_=SSLDatasetName,
doc="Name of the dataset to use for the linear head training")
linear_head_batch_size = param.Integer(default=256, doc="Batch size for linear head tuning")
linear_head_batch_size = param.Integer(default=16, doc="Batch size for linear head tuning")
learning_rate_linear_head_during_ssl_training = param.Number(default=1e-4,
doc="Learning rate for linear head training during "
"SSL training.")
drop_last = param.Boolean(default=True, doc="If True drops the last incomplete batch")

def setup(self) -> None:
from InnerEye.ML.SSL.lightning_containers.ssl_image_classifier import SSLClassifierContainer
if self.is_debug_model:
self.pl_limit_train_batches = 1
self.pl_limit_val_batches = 1
self.pl_find_unused_parameters = True
self.total_num_gpus = self.num_gpus_per_node() * self.num_nodes
self._load_config()
# If you're using the same data for training and linear head, allow the user to specify the dataset only
@@ -199,16 +203,17 @@ def _create_ssl_data_modules(self, is_ssl_encoder_module: bool) -> InnerEyeVisio
train_transforms, val_transforms = self._get_transforms(datamodule_args.augmentation_params,
datamodule_args.dataset_name,
is_ssl_encoder_module)
batch_size_per_gpu = datamodule_args.batch_size // self.total_num_gpus if self.total_num_gpus > 0 else \
datamodule_args.batch_size
logging.info(f"Batch size per gpu: {batch_size_per_gpu}")
batch_multiplier = self.total_num_gpus if self.total_num_gpus > 0 else 1
effective_batch_size = datamodule_args.batch_size * batch_multiplier
logging.info(f"Batch size per GPU: {datamodule_args.batch_size}")
logging.info(f"Effective batch size on {batch_multiplier} GPUs: {effective_batch_size}")
dm = InnerEyeVisionDataModule(dataset_cls=self._SSLDataClassMappings[datamodule_args.dataset_name],
return_index=not is_ssl_encoder_module, # index is only needed for linear head
train_transforms=train_transforms,
val_split=0.1,
val_transforms=val_transforms,
data_dir=str(datamodule_args.dataset_path),
batch_size=batch_size_per_gpu,
batch_size=datamodule_args.batch_size,
num_workers=self.num_workers,
seed=self.random_seed,
drop_last=self.drop_last)
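With this change the configured batch size is interpreted per GPU, so the effective batch size is the per-GPU value times the total number of GPUs. A minimal sketch of that bookkeeping (the function name and the CPU-only fallback are illustrative, not the container's exact code):

```python
def effective_batch_size(batch_size_per_gpu: int, num_gpus_per_node: int, num_nodes: int) -> int:
    """Effective batch size under DDP: every GPU processes its own batch of the configured size."""
    total_num_gpus = num_gpus_per_node * num_nodes
    batch_multiplier = total_num_gpus if total_num_gpus > 0 else 1  # CPU-only runs
    return batch_size_per_gpu * batch_multiplier

# The example from the ssl_training_batch_size docstring: 100 per GPU on 4 nodes with 4 GPUs each.
assert effective_batch_size(100, num_gpus_per_node=4, num_nodes=4) == 1600
```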
@@ -226,9 +231,9 @@ def _get_transforms(self, augmentation_config: Optional[CfgNode],
:param dataset_name: name of the dataset, value has to be in SSLDatasetName, determines which transformation
pipeline to return.
:param is_ssl_encoder_module: if True the transformation pipeline will yield two versions of the image it is
applied on and it applies the training transformations also at validation time. Note that if your transformation
does not contain any randomness, the pipeline will return two identical copies. If False, it will return only one
transformation.
applied on and it applies the training transformations also at validation time. Note that if your
transformation does not contain any randomness, the pipeline will return two identical copies. If False, it
will return only one transformation.
:return: training transformation pipeline and validation transformation pipeline.
"""
if dataset_name in [SSLDatasetName.RSNAKaggleCXR.value,
@@ -269,6 +274,4 @@ def get_trainer_arguments(self) -> Dict[str, Any]:
drop_p=0.2,
learning_rate=self.learning_rate_linear_head_during_ssl_training)
trainer_kwargs: Dict[str, Any] = {"callbacks": self.online_eval}
if self.is_debug_model:
trainer_kwargs.update({"limit_train_batches": 1, "limit_val_batches": 1})
return trainer_kwargs
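The limit_train_batches / limit_val_batches overrides that used to be injected into the trainer kwargs here for debug models are now set in setup() via the pl_limit_train_batches, pl_limit_val_batches and pl_find_unused_parameters attributes. A hedged sketch of the Trainer settings those attributes ultimately translate to (the exact plumbing lives elsewhere in the framework):

```python
from pytorch_lightning import Trainer

# Rough equivalent of a debug-model run: a single train and a single validation batch per epoch.
# pl_find_unused_parameters is assumed to map to find_unused_parameters=True on the DDP
# plugin/strategy, which tolerates model parts that receive no gradient in a given step.
trainer = Trainer(
    limit_train_batches=1,
    limit_val_batches=1,
)
```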
7 changes: 3 additions & 4 deletions InnerEye/ML/SSL/lightning_containers/ssl_image_classifier.py
@@ -64,7 +64,6 @@ def get_data_module(self) -> InnerEyeDataModuleTypes:
return self.data_module

def get_trainer_arguments(self) -> Dict[str, Any]:
trained_kwargs = {}
if self.is_debug_model:
trained_kwargs.update({"limit_train_batches": 1, "limit_val_batches": 1})
return trained_kwargs
# This class inherits from SSLContainer, where the get_trainer_arguments adds the online evaluator callback.
# We don't need that for the classifier, hence need to return an empty set of trainer arguments.
return {}
20 changes: 12 additions & 8 deletions InnerEye/ML/SSL/lightning_modules/byol/byol_module.py
@@ -9,14 +9,15 @@
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from health_ml.utils import log_learning_rate, log_on_epoch
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from pytorch_lightning import Trainer
from torch import Tensor as T
from torch.optim import Adam

from InnerEye.ML.SSL.lightning_modules.byol.byol_models import SiameseArm
from InnerEye.ML.SSL.lightning_modules.byol.byol_moving_average import ByolMovingAverageWeightUpdate
from InnerEye.ML.SSL.utils import SSLDataModuleType
from pytorch_lightning import Trainer

SingleBatchType = Tuple[List, T]
BatchType = Union[Dict[SSLDataModuleType, SingleBatchType], SingleBatchType]
@@ -98,14 +99,15 @@ def shared_step(self, batch: BatchType, batch_idx: int) -> T:

return loss

def training_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> T: # type: ignore
def training_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> torch.Tensor: # type: ignore
loss = self.shared_step(batch, batch_idx)
self.log_dict({'byol/train/loss': loss, 'byol/tau': self.weight_callback.current_tau})
log_on_epoch(self, metrics={'byol/train/loss': loss, 'byol/tau': self.weight_callback.current_tau})
log_learning_rate(self, name="byol/learning_rate")
return loss

def validation_step(self, batch: BatchType, batch_idx: int, **kwargs: Any) -> T: # type: ignore
loss = self.shared_step(batch, batch_idx)
self.log_dict({'byol/val/loss': loss})
log_on_epoch(self, 'byol/val/loss', loss)
return loss

def setup(self, *args: Any, **kwargs: Any) -> None:
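The byol/tau value logged above is the exponential-moving-average coefficient that ByolMovingAverageWeightUpdate uses to pull the target network towards the online network. A simplified sketch of that update rule (the real callback also anneals tau over training; this is not its exact code):

```python
import torch

@torch.no_grad()
def update_target_network(online: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # Exponential moving average: the target weights trail the online weights.
    for online_p, target_p in zip(online.parameters(), target.parameters()):
        target_p.mul_(tau).add_(online_p, alpha=1.0 - tau)
```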
@@ -116,9 +118,12 @@ def configure_optimizers(self) -> Any:
# exclude certain parameters
parameters = self.exclude_from_wt_decay(self.online_network.named_parameters(),
weight_decay=self.hparams.weight_decay) # type: ignore
optimizer = Adam(parameters, lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay) # type: ignore
scheduler = LinearWarmupCosineAnnealingLR(
optimizer, warmup_epochs=self.hparams.warmup_epochs, max_epochs=self.hparams.max_epochs) # type: ignore
optimizer = Adam(parameters,
lr=self.hparams.learning_rate, # type: ignore
weight_decay=self.hparams.weight_decay) # type: ignore
scheduler = LinearWarmupCosineAnnealingLR(optimizer,
warmup_epochs=self.hparams.warmup_epochs, # type: ignore
max_epochs=self.hparams.max_epochs) # type: ignore
return [optimizer], [scheduler]

def exclude_from_wt_decay(self,
Expand All @@ -144,4 +149,3 @@ def exclude_from_wt_decay(self,
{'params': params, 'weight_decay': weight_decay},
{'params': excluded_params, 'weight_decay': 0.}
]

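exclude_from_wt_decay hands Adam two parameter groups, so that bias and normalisation parameters are trained without weight decay while everything else keeps the configured value. A self-contained sketch of that pattern (the skip list and the toy model are placeholders, not the module's actual configuration):

```python
import torch
from torch.optim import Adam

def split_params_for_weight_decay(named_parameters, weight_decay, skip_list=("bias", "bn")):
    decay, no_decay = [], []
    for name, param in named_parameters:
        if not param.requires_grad:
            continue
        if any(skip in name for skip in skip_list):
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.BatchNorm1d(8))
optimizer = Adam(split_params_for_weight_decay(model.named_parameters(), weight_decay=1e-6), lr=1e-4)
```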
20 changes: 14 additions & 6 deletions InnerEye/ML/SSL/lightning_modules/simclr_module.py
@@ -8,13 +8,13 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from health_ml.utils import log_learning_rate, log_on_epoch
from pl_bolts.models.self_supervised.simclr.simclr_module import SimCLR
from torch import Tensor as T

from InnerEye.ML.SSL.encoders import SSLEncoder
from InnerEye.ML.SSL.utils import SSLDataModuleType

SingleBatchType = Tuple[List, T]
SingleBatchType = Tuple[List, torch.Tensor]
BatchType = Union[Dict[SSLDataModuleType, SingleBatchType], SingleBatchType]


@@ -57,7 +57,18 @@ def __init__(self, encoder_name: str, dataset_name: str, use_7x7_first_conv_in_r
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.encoder(x)

def shared_step(self, batch: BatchType) -> T:
def training_step(self, batch: BatchType, batch_idx: int) -> torch.Tensor:
loss = self.shared_step(batch)
log_on_epoch(self, "simclr/train/loss", loss, sync_dist=False)
log_learning_rate(self, name="simclr/learning_rate")
return loss

def validation_step(self, batch: BatchType, batch_idx: int) -> torch.Tensor: # type: ignore
loss = self.shared_step(batch)
log_on_epoch(self, "simclr/val/loss", loss, sync_dist=False)
return loss

def shared_step(self, batch: BatchType) -> torch.Tensor:
batch = batch[SSLDataModuleType.ENCODER] if isinstance(batch, dict) else batch

(img1, img2), y = batch
Expand All @@ -72,6 +83,3 @@ def shared_step(self, batch: BatchType) -> T:
loss = self.nt_xent_loss(z1, z2, self.temperature)

return loss



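shared_step delegates the contrastive objective to self.nt_xent_loss from the pl_bolts SimCLR base class. For orientation, a simplified single-process sketch of an NT-Xent loss (illustrative only, not the pl_bolts implementation, which also handles distributed negatives):

```python
import torch
import torch.nn.functional as F

def nt_xent_loss(z1: torch.Tensor, z2: torch.Tensor, temperature: float = 0.5) -> torch.Tensor:
    """Normalized temperature-scaled cross entropy over two augmented views of the same batch."""
    n = z1.shape[0]
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)   # (2N, d) unit-norm projections
    sim = z @ z.t() / temperature                         # pairwise cosine similarities
    sim.fill_diagonal_(float("-inf"))                     # a view is never its own negative
    # The positive for sample i in the first half is sample i in the second half, and vice versa.
    targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
    return F.cross_entropy(sim, targets)

loss = nt_xent_loss(torch.randn(4, 16), torch.randn(4, 16))
```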
9 changes: 5 additions & 4 deletions InnerEye/ML/SSL/lightning_modules/ssl_classifier_module.py
@@ -7,6 +7,7 @@
import torch
from torchmetrics import Metric
from pl_bolts.models.self_supervised import SSLEvaluator
from health_ml.utils import log_on_epoch
from torch.nn import functional as F

from InnerEye.ML.SSL.encoders import get_encoder_output_dim
@@ -79,16 +80,16 @@ def shared_step(self, batch: Any, is_training: bool) -> Any:

def training_step(self, batch: Any, batch_id: int, *args: Any, **kwargs: Any) -> Any: # type: ignore
loss = self.shared_step(batch, True)
self.log("train/loss", loss, on_step=False, on_epoch=True)
log_on_epoch(self, "train/loss", loss)
for metric in self.train_metrics:
self.log(f"train/{metric.name}", metric, on_epoch=True, on_step=False)
log_on_epoch(self, f"train/{metric.name}", metric)
return loss

def validation_step(self, batch: Any, batch_id: int, *args: Any, **kwargs: Any) -> None: # type: ignore
loss = self.shared_step(batch, is_training=False)
self.log('val/loss', loss, on_step=False, on_epoch=True, sync_dist=True)
log_on_epoch(self, 'val/loss', loss)
for metric in self.val_metrics:
self.log(f"val/{metric.name}", metric, on_epoch=True, on_step=False)
log_on_epoch(self, f"val/{metric.name}", metric)

def get_input_tensors(self, item: ScalarItem) -> List[torch.Tensor]:
"""
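The replaced self.log calls above show what log_on_epoch from health_ml.utils stands in for: epoch-level aggregation with DDP-aware syncing. A hedged sketch of a roughly equivalent helper (the actual hi-ml implementation may differ in signature and defaults):

```python
import torch
from pytorch_lightning import LightningModule

def log_on_epoch_sketch(module: LightningModule, name: str, value: torch.Tensor) -> None:
    # Epoch-level logging, synced across ranks when more than one process is running.
    sync_dist = torch.distributed.is_available() and torch.distributed.is_initialized()
    module.log(name, value, on_step=False, on_epoch=True, sync_dist=sync_dist)
```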