diff --git a/CHANGELOG.md b/CHANGELOG.md
index eafd985e0..f2733013a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,8 @@ any large models anymore because data loaders ran out of memory.
 LightningContainer models can get stuck at test set inference.
 - ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading
 multiple large checkpoints can time out.
+- ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset
+mounting and running matplotlib on some machines. Re-enabled a disabled test.

 ### Removed

diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py
index 1b9784d09..888623c35 100644
--- a/InnerEye/Azure/azure_runner.py
+++ b/InnerEye/Azure/azure_runner.py
@@ -229,6 +229,9 @@ def get_or_create_python_environment(azure_config: AzureConfig,
         # Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
         "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
         "MKL_SERVICE_FORCE_INTEL": "1",
+        # Switching to a new software stack in AML for mounting datasets
+        "RSLEX_DIRECT_VOLUME_MOUNT": "true",
+        "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
         **(source_config.environment_variables or {})
     }
     base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py
index 9ba94d2d4..2ab2c6f4b 100755
--- a/InnerEye/ML/runner.py
+++ b/InnerEye/ML/runner.py
@@ -7,6 +7,8 @@
 import warnings
 from pathlib import Path

+import matplotlib
+
 # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
 # individual warnings only.
 # flake8: noqa
@@ -68,11 +70,12 @@ def initialize_rpdb() -> None:
                  f"kill -TRAP ; nc 127.0.0.1 {rpdb_port}")


-def suppress_logging_noise() -> None:
+def package_setup_and_hacks() -> None:
     """
-    Reduce the logging level for some of the used libraries, which are particularly talkative in DEBUG mode.
-    Usually when running in DEBUG mode, we want diagnostics about the model building itself, but not for the
-    underlying libraries.
+    Set up the Python packages where needed. In particular, reduce the logging level for some of the used
+    libraries, which are particularly talkative in DEBUG mode. Usually when running in DEBUG mode, we want
+    diagnostics about the model building itself, but not for the underlying libraries.
+    It also adds workarounds for known issues in some packages.
     """
     # Numba code generation is extremely talkative in DEBUG mode, disable that.
     logging.getLogger('numba').setLevel(logging.WARNING)
@@ -89,6 +92,10 @@ def suppress_logging_noise() -> None:
     # This is working around a spurious error message thrown by MKL, see
     # https://github.com/pytorch/pytorch/issues/37377
     os.environ['MKL_THREADING_LAYER'] = 'GNU'
+    # Workaround for issues with matplotlib on some X servers, see
+    # https://stackoverflow.com/questions/45993879/matplot-lib-fatal-io-error-25-inappropriate-ioctl-for-device-on-x
+    # -server-loc
+    matplotlib.use('Agg')


 class Runner:
@@ -279,7 +286,7 @@ def run_in_situ(self) -> None:
         # build itself, but not the tons of debug information that AzureML submissions create.
         # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
         logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
-        suppress_logging_noise()
+        package_setup_and_hacks()
         if is_global_rank_zero():
             self.print_git_tags()
         # For the PR build in AzureML, we can either pytest, or the training of the simple PR model. Running both
diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py
index 02faeafb6..cc13155c1 100644
--- a/InnerEye/ML/utils/io_util.py
+++ b/InnerEye/ML/utils/io_util.py
@@ -2,23 +2,22 @@
 #  Copyright (c) Microsoft Corporation. All rights reserved.
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------
+import shutil
+import uuid
 from copy import copy
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-import shutil
 from typing import Dict, Generic, Iterable, List, Optional, Tuple, Type, TypeVar, Union
-import uuid
-
-import h5py
-from numpy.lib.npyio import NpzFile
-from skimage.transform import resize

 import SimpleITK as sitk
+import h5py
 import numpy as np
 import pandas as pd
 import pydicom as dicom
 import torch
+from numpy.lib.npyio import NpzFile
+from skimage.transform import resize
 from tabulate import tabulate

 from InnerEye.Common import common_util
diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py
index 9b99b6a3b..dbaa9feab 100644
--- a/Tests/AfterTraining/test_after_training.py
+++ b/Tests/AfterTraining/test_after_training.py
@@ -368,7 +368,6 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:


 @pytest.mark.after_training_2node
-@pytest.mark.skip("Test times out for unknown reasons.")
 def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
     args_list = ["--model", "BasicModel2EpochsMoreData",
                  "--azureml", "True",
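
Note on the RSLEX_* variables added in azure_runner.py: they opt the AzureML run into the rslex-based direct volume mount for datasets. The snippet below is a hypothetical, standalone sketch of attaching the same variables to an AzureML SDK v1 Environment outside of InnerEye; the environment name is made up, and InnerEye itself sets these values inside get_or_create_python_environment as shown in the diff.

```python
# Hypothetical sketch (not part of this PR): attach the same mounting-related variables
# to an AzureML SDK v1 Environment. The environment name below is illustrative only.
from azureml.core import Environment

env = Environment(name="innereye-example-env")
# Set the same environment variables that the diff adds, enabling the rslex-based
# direct volume mount for datasets and its cache size setting.
env.environment_variables["RSLEX_DIRECT_VOLUME_MOUNT"] = "true"
env.environment_variables["RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE"] = "1"
```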
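
The matplotlib.use('Agg') call added to runner.py forces the non-interactive Agg backend, so plotting never depends on an X server or display. Below is a minimal, self-contained sketch of the same workaround; the helper name and output file are illustrative and not part of the InnerEye code base.

```python
# Minimal sketch of the headless-plotting workaround applied in runner.py above.
# The helper name `save_loss_curve` and the output file name are illustrative only.
import matplotlib

# Select the non-interactive Agg backend before any figure is created, so that
# rendering happens entirely off-screen and never touches an X server.
matplotlib.use('Agg')

import matplotlib.pyplot as plt


def save_loss_curve(losses: list, output_file: str = "loss.png") -> None:
    """Render a simple line plot to a PNG file without needing a display."""
    fig, ax = plt.subplots()
    ax.plot(losses)
    ax.set_xlabel("epoch")
    ax.set_ylabel("loss")
    fig.savefig(output_file)
    plt.close(fig)


if __name__ == "__main__":
    save_loss_curve([1.0, 0.6, 0.4, 0.3])
```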