This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -38,6 +38,8 @@ any large models anymore because data loaders ran out of memory.
LightningContainer models can get stuck at test set inference.
- ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading
multiple large checkpoints can time out.
- ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset
mounting and running matplotlib on some machines. Re-enabled a previously disabled test.

### Removed

3 changes: 3 additions & 0 deletions InnerEye/Azure/azure_runner.py
@@ -229,6 +229,9 @@ def get_or_create_python_environment(azure_config: AzureConfig,
# Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
"AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
"MKL_SERVICE_FORCE_INTEL": "1",
# Switching to a new software stack in AML for mounting datasets
"RSLEX_DIRECT_VOLUME_MOUNT": "true",
"RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
**(source_config.environment_variables or {})
}
base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
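For context, a minimal sketch (not the repository's code) of how the environment-variable dictionary above is assembled: the fixed workaround values come first, and any user-supplied variables from `source_config.environment_variables` are spread in afterwards, so explicit user settings win on key collisions. The helper name `build_environment_variables` is illustrative only.

```python
from typing import Dict, Optional


def build_environment_variables(user_variables: Optional[Dict[str, str]]) -> Dict[str, str]:
    """Assemble the environment variables for the AzureML run; user values override defaults."""
    return {
        # Allow long-running uploads at the end of the job (AzureML default is 300 seconds).
        "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
        "MKL_SERVICE_FORCE_INTEL": "1",
        # Opt in to the rslex-based direct volume mount for datasets, with a small mount cache.
        "RSLEX_DIRECT_VOLUME_MOUNT": "true",
        "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
        # User-supplied variables are merged last, so they take precedence on key collisions.
        **(user_variables or {}),
    }


if __name__ == "__main__":
    print(build_environment_variables({"MKL_SERVICE_FORCE_INTEL": "0"}))
```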
17 changes: 12 additions & 5 deletions InnerEye/ML/runner.py
@@ -7,6 +7,8 @@
import warnings
from pathlib import Path

import matplotlib

# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
# flake8: noqa
@@ -68,11 +70,12 @@ def initialize_rpdb() -> None:
f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}")


def suppress_logging_noise() -> None:
def package_setup_and_hacks() -> None:
"""
Reduce the logging level for some of the used libraries, which are particularly talkative in DEBUG mode.
Usually when running in DEBUG mode, we want diagnostics about the model building itself, but not for the
underlying libraries.
Set up the Python packages where needed. In particular, reduce the logging level for some of the used
libraries, which are particularly talkative in DEBUG mode. Usually when running in DEBUG mode, we want
diagnostics about the model building itself, but not for the underlying libraries.
It also adds workarounds for known issues in some packages.
"""
# Numba code generation is extremely talkative in DEBUG mode, disable that.
logging.getLogger('numba').setLevel(logging.WARNING)
@@ -89,6 +92,10 @@ def suppress_logging_noise() -> None:
# This is working around a spurious error message thrown by MKL, see
# https://github.com/pytorch/pytorch/issues/37377
os.environ['MKL_THREADING_LAYER'] = 'GNU'
# Workaround for issues with matplotlib on some X servers, see
# https://stackoverflow.com/questions/45993879/matplot-lib-fatal-io-error-25-inappropriate-ioctl-for-device-on-x
# -server-loc
matplotlib.use('Agg')


class Runner:
@@ -279,7 +286,7 @@ def run_in_situ(self) -> None:
# build itself, but not the tons of debug information that AzureML submissions create.
# Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
suppress_logging_noise()
package_setup_and_hacks()
if is_global_rank_zero():
self.print_git_tags()
# For the PR build in AzureML, we can either pytest, or the training of the simple PR model. Running both
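For context, a standalone sketch of the matplotlib workaround that `package_setup_and_hacks` applies: selecting the non-interactive Agg backend before pyplot is first used lets figures be rendered to files on machines without a working X server. The output file name below is illustrative only.

```python
import matplotlib

# Select the non-interactive Agg backend before pyplot is imported, so no
# display or X server is needed (avoids "Inappropriate ioctl for device" errors).
matplotlib.use("Agg")

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
# With Agg, figures are written straight to disk instead of being shown on screen.
fig.savefig("example_plot.png")
```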
11 changes: 5 additions & 6 deletions InnerEye/ML/utils/io_util.py
@@ -2,23 +2,22 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import shutil
import uuid
from copy import copy
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
import shutil
from typing import Dict, Generic, Iterable, List, Optional, Tuple, Type, TypeVar, Union
import uuid

import h5py
from numpy.lib.npyio import NpzFile
from skimage.transform import resize

import SimpleITK as sitk
import h5py
import numpy as np
import pandas as pd
import pydicom as dicom
import torch
from numpy.lib.npyio import NpzFile
from skimage.transform import resize
from tabulate import tabulate

from InnerEye.Common import common_util
1 change: 0 additions & 1 deletion Tests/AfterTraining/test_after_training.py
@@ -368,7 +368,6 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:


@pytest.mark.after_training_2node
@pytest.mark.skip("Test times out for unknown reasons.")
def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
args_list = ["--model", "BasicModel2EpochsMoreData",
"--azureml", "True",
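For context, an illustrative sketch (not the repository's test) of the pytest mechanics behind this change: a `@pytest.mark.skip` decorator excludes a test from the run, and deleting the decorator, as this diff does for `test_recovery_on_2_nodes`, re-enables it. The test names and bodies below are placeholders.

```python
import pytest


@pytest.mark.skip("Test times out for unknown reasons.")  # removing this line re-enables the test
def test_recovery_on_2_nodes_skipped() -> None:
    assert True


def test_recovery_on_2_nodes_enabled() -> None:
    # With the skip marker removed, pytest collects and runs this test again.
    assert True
```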