From 6ffaaf8629ef456937b0cb046a72f0a6bb3fe0e2 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Mon, 28 Jun 2021 11:49:04 +0100 Subject: [PATCH 01/10] fixes --- InnerEye/Azure/azure_runner.py | 2 ++ InnerEye/ML/runner.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py index 1b9784d09..768643c61 100644 --- a/InnerEye/Azure/azure_runner.py +++ b/InnerEye/Azure/azure_runner.py @@ -229,6 +229,8 @@ def get_or_create_python_environment(azure_config: AzureConfig, # Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300. "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900", "MKL_SERVICE_FORCE_INTEL": "1", + # Switching to a new software stack in AML for mounting datasets + "RSLEX_DIRECT_VOLUME_MOUNT": "true", **(source_config.environment_variables or {}) } base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04" diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py index 9ba94d2d4..2ab2c6f4b 100755 --- a/InnerEye/ML/runner.py +++ b/InnerEye/ML/runner.py @@ -7,6 +7,8 @@ import warnings from pathlib import Path +import matplotlib + # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress # individual warnings only. # flake8: noqa @@ -68,11 +70,12 @@ def initialize_rpdb() -> None: f"kill -TRAP ; nc 127.0.0.1 {rpdb_port}") -def suppress_logging_noise() -> None: +def package_setup_and_hacks() -> None: """ - Reduce the logging level for some of the used libraries, which are particularly talkative in DEBUG mode. - Usually when running in DEBUG mode, we want diagnostics about the model building itself, but not for the - underlying libraries. + Set up the Python packages where needed. In particular, reduce the logging level for some of the used + libraries, which are particularly talkative in DEBUG mode. 
Usually when running in DEBUG mode, we want + diagnostics about the model building itself, but not for the underlying libraries. + It also adds workarounds for known issues in some packages. """ # Numba code generation is extremely talkative in DEBUG mode, disable that. logging.getLogger('numba').setLevel(logging.WARNING) @@ -89,6 +92,10 @@ def suppress_logging_noise() -> None: # This is working around a spurious error message thrown by MKL, see # https://github.com/pytorch/pytorch/issues/37377 os.environ['MKL_THREADING_LAYER'] = 'GNU' + # Workaround for issues with matplotlib on some X servers, see + # https://stackoverflow.com/questions/45993879/matplot-lib-fatal-io-error-25-inappropriate-ioctl-for-device-on-x + # -server-loc + matplotlib.use('Agg') class Runner: @@ -279,7 +286,7 @@ def run_in_situ(self) -> None: # build itself, but not the tons of debug information that AzureML submissions create. # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR") - suppress_logging_noise() + package_setup_and_hacks() if is_global_rank_zero(): self.print_git_tags() # For the PR build in AzureML, we can either pytest, or the training of the simple PR model. Running both From 9d9d6dc22528f39095a903670f60310cb5ac6631 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Mon, 28 Jun 2021 11:51:28 +0100 Subject: [PATCH 02/10] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92f6631b8..99d33814d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,8 @@ any large models anymore because data loaders ran out of memory. LightningContainer models can get stuck at test set inference. - ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading multiple large checkpoints can time out. 
+- ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset +mounting and running matplotlib on some machines. ### Removed From 847ef4a616218b2fee5649165af66648651e7eff Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 30 Jun 2021 09:23:03 +0100 Subject: [PATCH 03/10] re-instantiate test --- Tests/AfterTraining/test_after_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 9b99b6a3b..dbaa9feab 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -368,7 +368,6 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None: @pytest.mark.after_training_2node -@pytest.mark.skip("Test times out for unknown reasons.") def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None: args_list = ["--model", "BasicModel2EpochsMoreData", "--azureml", "True", From 1afd803e38432e3ad2de66c9bef1149295c8b798 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 30 Jun 2021 09:23:50 +0100 Subject: [PATCH 04/10] re-instantiate test --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99d33814d..f581eca64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ LightningContainer models can get stuck at test set inference. - ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading multiple large checkpoints can time out. - ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset -mounting and running matplotlib on some machines. +mounting and running matplotlib on some machines. Re-instantiated a disabled test. 
### Removed From d4e272bae90f288999429df58abdfb3d8d751f93 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 7 Jul 2021 20:14:00 +0100 Subject: [PATCH 05/10] catch loading errors --- InnerEye/ML/utils/io_util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 02faeafb6..cbfc44d84 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -2,25 +2,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import shutil +import uuid from copy import copy from dataclasses import dataclass from enum import Enum from pathlib import Path -import shutil from typing import Dict, Generic, Iterable, List, Optional, Tuple, Type, TypeVar, Union -import uuid - -import h5py -from numpy.lib.npyio import NpzFile -from skimage.transform import resize import SimpleITK as sitk +import h5py import numpy as np import pandas as pd import pydicom as dicom import torch +from numpy.lib.npyio import NpzFile +from skimage.transform import resize from tabulate import tabulate +from InnerEye.Azure.azure_util import RUN_CONTEXT from InnerEye.Common import common_util from InnerEye.Common.type_annotations import PathOrString, TupleFloat3, TupleInt3 from InnerEye.ML.config import DEFAULT_POSTERIOR_VALUE_RANGE, PhotometricNormalizationMethod, \ @@ -226,7 +226,13 @@ def _is_valid_image_path(_path: Path) -> bool: if path is None or not _is_valid_image_path(path): raise ValueError("Invalid path to image: {}".format(path)) - img, header = read_image_as_array_with_header(path) + try: + img, header = read_image_as_array_with_header(path) + except Exception: + uploaded_name = f"loading_failure_{str(uuid.uuid4().hex)}" + print(f"Unable to read image {path}. 
Uploading the failed image to AML as {uploaded_name}") + RUN_CONTEXT.upload_file(uploaded_name, path) + raise # ensure a 3D image is loaded if not len(img.shape) == 3: From d2a30d5f3b19ea4526e9725129c5bbc5fd3264f5 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 8 Jul 2021 20:47:43 +0100 Subject: [PATCH 06/10] syntax fix --- InnerEye/ML/utils/io_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index cbfc44d84..feb4bf58e 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -228,11 +228,11 @@ def _is_valid_image_path(_path: Path) -> bool: try: img, header = read_image_as_array_with_header(path) - except Exception: + except Exception as ex: uploaded_name = f"loading_failure_{str(uuid.uuid4().hex)}" print(f"Unable to read image {path}. Uploading the failed image to AML as {uploaded_name}") RUN_CONTEXT.upload_file(uploaded_name, path) - raise + raise ex # ensure a 3D image is loaded if not len(img.shape) == 3: From ea241fb894ddce0d6db08ab67876e16a7adeade6 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 8 Jul 2021 20:51:40 +0100 Subject: [PATCH 07/10] syntax fix --- InnerEye/ML/utils/io_util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index feb4bf58e..1e5904428 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -3,6 +3,7 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ import shutil +import time import uuid from copy import copy from dataclasses import dataclass @@ -231,7 +232,10 @@ def _is_valid_image_path(_path: Path) -> bool: except Exception as ex: uploaded_name = f"loading_failure_{str(uuid.uuid4().hex)}" print(f"Unable to read image {path}. 
Uploading the failed image to AML as {uploaded_name}") + print(f"File has size {path.stat().st_size}") RUN_CONTEXT.upload_file(uploaded_name, path) + time.sleep(30) + print("Done sleeping") raise ex # ensure a 3D image is loaded From a9a1813d21ce032074cb5c6185cd529b5d68f381 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 14 Jul 2021 20:03:23 +0100 Subject: [PATCH 08/10] streaming mode --- InnerEye/Azure/azure_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py index 768643c61..888623c35 100644 --- a/InnerEye/Azure/azure_runner.py +++ b/InnerEye/Azure/azure_runner.py @@ -231,6 +231,7 @@ def get_or_create_python_environment(azure_config: AzureConfig, "MKL_SERVICE_FORCE_INTEL": "1", # Switching to a new software stack in AML for mounting datasets "RSLEX_DIRECT_VOLUME_MOUNT": "true", + "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1", **(source_config.environment_variables or {}) } base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04" From 09c4fe78c564d5e28796a3583ec813aa6c9affc2 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 14 Jul 2021 22:13:27 +0100 Subject: [PATCH 09/10] undoing test code --- InnerEye/ML/utils/io_util.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 1e5904428..fcae8a14f 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -227,16 +227,7 @@ def _is_valid_image_path(_path: Path) -> bool: if path is None or not _is_valid_image_path(path): raise ValueError("Invalid path to image: {}".format(path)) - try: - img, header = read_image_as_array_with_header(path) - except Exception as ex: - uploaded_name = f"loading_failure_{str(uuid.uuid4().hex)}" - print(f"Unable to read image {path}. 
Uploading the failed image to AML as {uploaded_name}") - print(f"File has size {path.stat().st_size}") - RUN_CONTEXT.upload_file(uploaded_name, path) - time.sleep(30) - print("Done sleeping") - raise ex + img, header = read_image_as_array_with_header(path) # ensure a 3D image is loaded if not len(img.shape) == 3: From 5706d27f177172e7ebdd5e92358882d37f08cf11 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 14 Jul 2021 22:14:08 +0100 Subject: [PATCH 10/10] package cleanup --- InnerEye/ML/utils/io_util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index fcae8a14f..cc13155c1 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -3,7 +3,6 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ import shutil -import time import uuid from copy import copy from dataclasses import dataclass @@ -21,7 +20,6 @@ from skimage.transform import resize from tabulate import tabulate -from InnerEye.Azure.azure_util import RUN_CONTEXT from InnerEye.Common import common_util from InnerEye.Common.type_annotations import PathOrString, TupleFloat3, TupleInt3 from InnerEye.ML.config import DEFAULT_POSTERIOR_VALUE_RANGE, PhotometricNormalizationMethod, \