Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 8712267

Browse files
authored
Make pytorch run non-deterministically by default, upgrade to AML SDK 1.36 (#594)
1 parent b96afc3 commit 8712267

File tree

7 files changed

+30
-18
lines changed

7 files changed

+30
-18
lines changed

.idea/InnerEye-DeepLearning.iml

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ created.
1313
## Upcoming
1414

1515
### Added
16+
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
1617
- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor
1718
GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
1819
`BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
@@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders.
5354
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
5455
in the config to run inference on both the validation and test sets by default.
5556
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
57+
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) PyTorch is now non-deterministic by default. Upgraded to AzureML-SDK 1.36.
5658
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
5759
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
5860

InnerEye/ML/dataset/full_image_dataset.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,10 @@
77
from abc import ABC
88
from collections import Counter
99
from pathlib import Path
10-
from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple
10+
from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar
1111

1212
import pandas as pd
1313
import torch.utils.data
14-
from torch._six import container_abcs
1514
from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler
1615
from torch.utils.data.dataloader import default_collate # type: ignore
1716

@@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
3635
:return: collated result
3736
"""
3837
elem = batch[0]
39-
if isinstance(elem, container_abcs.Mapping):
38+
if isinstance(elem, Mapping):
4039
result = dict()
4140
for key in elem:
4241
# Special handling for all fields that store metadata, and for fields that are list.

InnerEye/ML/deep_learning_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized):
582582
doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
583583
"steps to run before training, to identify possible problems")
584584
pl_deterministic: bool = \
585-
param.Integer(default=True,
585+
param.Boolean(default=False,
586586
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
587587
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
588588
"you may see training speed increases.")

InnerEye/ML/lightning_base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
243243
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
244244
return [self.optimizer], [self.l_rate_scheduler] # type: ignore
245245

246+
@rank_zero_only
246247
def on_fit_end(self) -> None:
247248
"""
248-
Flushes all logger objects that the present object holds.
249+
Flushes all logger objects that the present object holds. This should only be run on rank zero, because
250+
otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on
251+
rank 0.
249252
"""
250253
self.train_epoch_metrics_logger.flush()
251254
self.val_epoch_metrics_logger.flush()

InnerEye/ML/runner.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
1313
# individual warnings only.
1414
# flake8: noqa
15+
1516
# Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
1617
# the working directory is not correctly picked up in sys.path
1718
print(f"Starting InnerEye runner at {sys.argv[0]}")
@@ -26,6 +27,7 @@
2627
fixed_paths.add_submodules_to_path()
2728

2829
from azureml._base_sdk_common import user_agent
30+
from azureml._restclient.constants import RunStatus
2931
from azureml.core import Run, ScriptRunConfig
3032
from health_azure import AzureRunInfo, submit_to_azure_if_needed
3133
from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
@@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None:
271273
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
272274

273275
if self.azure_config.wait_for_completion:
274-
# We want the job output to be visible on the console, but the program should not exit if the
275-
# job fails because we need to download the pytest result file.
276+
# We want the job output to be visible on the console. Do not exit yet if the job fails, because we
277+
# may need to download the pytest result file.
276278
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
277-
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
278-
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
279-
# A build step will pick up that file and publish it to Azure DevOps.
280-
# If pytest_mark is set, this file must exist.
281-
logging.info("Downloading pytest result file.")
282-
download_pytest_result(azure_run)
279+
if self.azure_config.pytest_mark:
280+
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
281+
# A build step will pick up that file and publish it to Azure DevOps.
282+
# If pytest_mark is set, this file must exist.
283+
logging.info("Downloading pytest result file.")
284+
download_pytest_result(azure_run)
285+
if azure_run.status == RunStatus.FAILED:
286+
raise ValueError(f"The AzureML run failed. Please check this URL for details: "
287+
f"{azure_run.get_portal_url()}")
283288

284289
hyperdrive_config = None
285290
if self.azure_config.hyperdrive:
@@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None:
326331
commandline_args=" ".join(source_config.script_params)),
327332
after_submission=after_submission_hook,
328333
hyperdrive_config=hyperdrive_config)
334+
# Set the default display name to what was provided as the "tag"
335+
if self.azure_config.tag:
336+
azure_run_info.run.display_name = self.azure_config.tag
329337
else:
330338
# compute_cluster_name is a required parameter in early versions of the HI-ML package
331339
azure_run_info = submit_to_azure_if_needed(
332340
input_datasets=input_datasets,
333-
submit_to_azureml=False,
334-
compute_cluster_name="")
341+
submit_to_azureml=False)
335342
finally:
336343
if temp_conda:
337344
temp_conda.unlink()

environment.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ dependencies:
1212
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
1313
- azure-mgmt-resource==12.1.0
1414
- azure-mgmt-datafactory==1.1.0
15-
- azureml-mlflow==1.32.0
16-
- azureml-sdk==1.32.0
17-
- azureml-tensorboard==1.32.0
15+
- azureml-mlflow==1.36.0
16+
- azureml-sdk==1.36.0
17+
- azureml-tensorboard==1.36.0
1818
- conda-merge==0.1.5
1919
- cryptography==3.3.2
2020
- dataclasses-json==0.5.2

0 commit comments

Comments
 (0)