Make pytorch run non-deterministically by default, upgrade to AML SDK 1.36 (#594)

ant0nsc · web-flow · commit 8712267fe7e4 · 2021-11-22T11:47:14.000Z
diff --git a/.idea/InnerEye-DeepLearning.iml b/.idea/InnerEye-DeepLearning.iml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ created.
 ## Upcoming
 
 ### Added
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
 - ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor 
   GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
   `BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
@@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders.
 - ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
   in the config to run inference on both the validation and test sets by default.
 - ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) PyTorch is now non-deterministic by default. Upgraded to AzureML-SDK 1.36.
 - ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
 - ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
 
diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py
@@ -7,11 +7,10 @@
 from abc import ABC
 from collections import Counter
 from pathlib import Path
-from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple
+from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar
 
 import pandas as pd
 import torch.utils.data
-from torch._six import container_abcs
 from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler
 from torch.utils.data.dataloader import default_collate  # type: ignore
 
@@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
     :return: collated result
     """
     elem = batch[0]
-    if isinstance(elem, container_abcs.Mapping):
+    if isinstance(elem, Mapping):
         result = dict()
         for key in elem:
             # Special handling for all fields that store metadata, and for fields that are list.
diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py
@@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized):
                       doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
                           "steps to run before training, to identify possible problems")
     pl_deterministic: bool = \
-        param.Integer(default=True,
+        param.Boolean(default=False,
                       doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
                           "'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
                           "you may see training speed increases.")
diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py
@@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
     def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
         return [self.optimizer], [self.l_rate_scheduler]  # type: ignore
 
+    @rank_zero_only
     def on_fit_end(self) -> None:
         """
-        Flushes all logger objects that the present object holds.
+        Flushes all logger objects that the present object holds. This should only be run on rank zero, because
+        otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on
+        rank 0.
         """
         self.train_epoch_metrics_logger.flush()
         self.val_epoch_metrics_logger.flush()
diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py
@@ -12,6 +12,7 @@
 # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
 # individual warnings only.
 # flake8: noqa
+
 # Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
 # the working directory is not correctly picked up in sys.path
 print(f"Starting InnerEye runner at {sys.argv[0]}")
@@ -26,6 +27,7 @@
 fixed_paths.add_submodules_to_path()
 
 from azureml._base_sdk_common import user_agent
+from azureml._restclient.constants import RunStatus
 from azureml.core import Run, ScriptRunConfig
 from health_azure import AzureRunInfo, submit_to_azure_if_needed
 from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
@@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None:
                       f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
 
             if self.azure_config.wait_for_completion:
-                # We want the job output to be visible on the console, but the program should not exit if the
-                # job fails because we need to download the pytest result file.
+                # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
+                # may need to download the pytest result file.
                 azure_run.wait_for_completion(show_output=True, raise_on_error=False)
-            if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
-                # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
-                # A build step will pick up that file and publish it to Azure DevOps.
-                # If pytest_mark is set, this file must exist.
-                logging.info("Downloading pytest result file.")
-                download_pytest_result(azure_run)
+                if self.azure_config.pytest_mark:
+                    # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
+                    # A build step will pick up that file and publish it to Azure DevOps.
+                    # If pytest_mark is set, this file must exist.
+                    logging.info("Downloading pytest result file.")
+                    download_pytest_result(azure_run)
+                if azure_run.status == RunStatus.FAILED:
+                    raise ValueError(f"The AzureML run failed. Please check this URL for details: "
+                                     f"{azure_run.get_portal_url()}")
 
         hyperdrive_config = None
         if self.azure_config.hyperdrive:
@@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None:
                         commandline_args=" ".join(source_config.script_params)),
                     after_submission=after_submission_hook,
                     hyperdrive_config=hyperdrive_config)
+                # Set the default display name to what was provided as the "tag"
+                if self.azure_config.tag:
+                    azure_run_info.run.display_name = self.azure_config.tag
             else:
                 # compute_cluster_name is a required parameter in early versions of the HI-ML package
                 azure_run_info = submit_to_azure_if_needed(
                     input_datasets=input_datasets,
-                    submit_to_azureml=False,
-                    compute_cluster_name="")
+                    submit_to_azureml=False)
         finally:
             if temp_conda:
                 temp_conda.unlink()
diff --git a/environment.yml b/environment.yml
@@ -12,9 +12,9 @@ dependencies:
       - git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
       - azure-mgmt-resource==12.1.0
       - azure-mgmt-datafactory==1.1.0
-      - azureml-mlflow==1.32.0
-      - azureml-sdk==1.32.0
-      - azureml-tensorboard==1.32.0
+      - azureml-mlflow==1.36.0
+      - azureml-sdk==1.36.0
+      - azureml-tensorboard==1.36.0
       - conda-merge==0.1.5
       - cryptography==3.3.2
       - dataclasses-json==0.5.2