This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -38,6 +38,8 @@ any large models anymore because data loaders ran out of memory.
LightningContainer models can get stuck at test set inference.
- ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading
multiple large checkpoints can time out.
- ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset
mounting and running matplotlib on some machines. Re-enabled a previously disabled test.

### Removed

3 changes: 3 additions & 0 deletions InnerEye/Azure/azure_runner.py
@@ -229,6 +229,9 @@ def get_or_create_python_environment(azure_config: AzureConfig,
# Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
"AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
"MKL_SERVICE_FORCE_INTEL": "1",
# Switching to a new software stack in AML for mounting datasets
"RSLEX_DIRECT_VOLUME_MOUNT": "true",
"RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
**(source_config.environment_variables or {})
}
base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
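For context, a minimal sketch (not the repository's code) of how the environment-variable dictionary above is assembled: the fixed workaround values come first, and any user-supplied variables from `source_config.environment_variables` are spread in afterwards, so explicit user settings win on key collisions. The helper name `build_environment_variables` is illustrative only.

```python
from typing import Dict, Optional


def build_environment_variables(user_variables: Optional[Dict[str, str]]) -> Dict[str, str]:
    """Assemble the environment variables for the AzureML run; user values override defaults."""
    return {
        # Allow long-running uploads at the end of the job (AzureML default is 300 seconds).
        "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
        "MKL_SERVICE_FORCE_INTEL": "1",
        # Opt in to the rslex-based direct volume mount for datasets, with a small mount cache.
        "RSLEX_DIRECT_VOLUME_MOUNT": "true",
        "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
        # User-supplied variables are merged last, so they take precedence on key collisions.
        **(user_variables or {}),
    }


if __name__ == "__main__":
    print(build_environment_variables({"MKL_SERVICE_FORCE_INTEL": "0"}))
```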
17 changes: 12 additions & 5 deletions InnerEye/ML/runner.py
@@ -7,6 +7,8 @@
import warnings
from pathlib import Path

import matplotlib

# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
# flake8: noqa
@@ -68,11 +70,12 @@ def initialize_rpdb() -> None:
f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}")


def suppress_logging_noise() -> None:
def package_setup_and_hacks() -> None:
"""
Reduce the logging level for some of the used libraries, which are particularly talkative in DEBUG mode.
Usually when running in DEBUG mode, we want diagnostics about the model building itself, but not for the
underlying libraries.
Set up the Python packages where needed. In particular, reduce the logging level for some of the used
libraries, which are particularly talkative in DEBUG mode. Usually when running in DEBUG mode, we want
diagnostics about the model building itself, but not for the underlying libraries.
It also adds workarounds for known issues in some packages.
"""
# Numba code generation is extremely talkative in DEBUG mode, disable that.
logging.getLogger('numba').setLevel(logging.WARNING)
@@ -89,6 +92,10 @@ def suppress_logging_noise() -> None:
# This is working around a spurious error message thrown by MKL, see
# https://github.com/pytorch/pytorch/issues/37377
os.environ['MKL_THREADING_LAYER'] = 'GNU'
# Workaround for issues with matplotlib on some X servers, see
# https://stackoverflow.com/questions/45993879/matplot-lib-fatal-io-error-25-inappropriate-ioctl-for-device-on-x
# -server-loc
matplotlib.use('Agg')


class Runner:
@@ -279,7 +286,7 @@ def run_in_situ(self) -> None:
# build itself, but not the tons of debug information that AzureML submissions create.
# Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
suppress_logging_noise()
package_setup_and_hacks()
if is_global_rank_zero():
self.print_git_tags()
# For the PR build in AzureML, we can either pytest, or the training of the simple PR model. Running both
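For context, a standalone sketch of the matplotlib workaround that `package_setup_and_hacks` applies: selecting the non-interactive Agg backend before pyplot is first used lets figures be rendered to files on machines without a working X server. The output file name below is illustrative only.

```python
import matplotlib

# Select the non-interactive Agg backend before pyplot is imported, so no
# display or X server is needed (avoids "Inappropriate ioctl for device" errors).
matplotlib.use("Agg")

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
# With Agg, figures are written straight to disk instead of being shown on screen.
fig.savefig("example_plot.png")
```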
11 changes: 5 additions & 6 deletions InnerEye/ML/utils/io_util.py
@@ -2,23 +2,22 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import shutil
import uuid
from copy import copy
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
import shutil
from typing import Dict, Generic, Iterable, List, Optional, Tuple, Type, TypeVar, Union
import uuid

import h5py
from numpy.lib.npyio import NpzFile
from skimage.transform import resize

import SimpleITK as sitk
import h5py
import numpy as np
import pandas as pd
import pydicom as dicom
import torch
from numpy.lib.npyio import NpzFile
from skimage.transform import resize
from tabulate import tabulate

from InnerEye.Common import common_util
1 change: 0 additions & 1 deletion Tests/AfterTraining/test_after_training.py
@@ -368,7 +368,6 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:


@pytest.mark.after_training_2node
@pytest.mark.skip("Test times out for unknown reasons.")
def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
args_list = ["--model", "BasicModel2EpochsMoreData",
"--azureml", "True",
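For context, an illustrative sketch (not the repository's test) of the pytest mechanics behind this change: a `@pytest.mark.skip` decorator excludes a test from the run, and deleting the decorator, as this diff does for `test_recovery_on_2_nodes`, re-enables it. The test names and bodies below are placeholders.

```python
import pytest


@pytest.mark.skip("Test times out for unknown reasons.")  # removing this line re-enables the test
def test_recovery_on_2_nodes_skipped() -> None:
    assert True


def test_recovery_on_2_nodes_enabled() -> None:
    # With the skip marker removed, pytest collects and runs this test again.
    assert True
```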