12 | 12 | # Suppress all errors here because the imports that appear after code cause loads of warnings. We can't suppress
13 | 13 | # individual warnings selectively.
14 | 14 | # flake8: noqa
| 15 | + |
15 | 16 | # Workaround for an issue with how AzureML and PyTorch Lightning interact: when spawning additional processes for DDP,
16 | 17 | # the working directory is not correctly picked up in sys.path
17 | 18 | print(f"Starting InnerEye runner at {sys.argv[0]}")
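The actual fix sits in the lines elided just below this comment. Purely as an illustrative sketch (not the repository's exact code), the workaround amounts to putting the runner's root folder back onto sys.path before the heavyweight imports:

import sys
from pathlib import Path

# Hypothetical sketch: re-add the project root so that DDP worker processes
# spawned by PyTorch Lightning can import the InnerEye package again.
project_root = Path(sys.argv[0]).resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))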
26 | 27 | fixed_paths.add_submodules_to_path() |
27 | 28 |
28 | 29 | from azureml._base_sdk_common import user_agent |
| 30 | +from azureml._restclient.constants import RunStatus |
29 | 31 | from azureml.core import Run, ScriptRunConfig |
30 | 32 | from health_azure import AzureRunInfo, submit_to_azure_if_needed |
31 | 33 | from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \ |
@@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None: |
271 | 273 | f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}") |
272 | 274 |
273 | 275 | if self.azure_config.wait_for_completion: |
274 | | - # We want the job output to be visible on the console, but the program should not exit if the |
275 | | - # job fails because we need to download the pytest result file. |
| 276 | + # We want the job output to be visible on the console. Do not exit yet if the job fails, because we |
| 277 | + # may need to download the pytest result file. |
276 | 278 | azure_run.wait_for_completion(show_output=True, raise_on_error=False) |
277 | | - if self.azure_config.pytest_mark and self.azure_config.wait_for_completion: |
278 | | - # The AzureML job can optionally run pytest. Attempt to download it to the current directory. |
279 | | - # A build step will pick up that file and publish it to Azure DevOps. |
280 | | - # If pytest_mark is set, this file must exist. |
281 | | - logging.info("Downloading pytest result file.") |
282 | | - download_pytest_result(azure_run) |
| 279 | + if self.azure_config.pytest_mark: |
| 280 | + # The AzureML job can optionally run pytest. Attempt to download it to the current directory. |
| 281 | + # A build step will pick up that file and publish it to Azure DevOps. |
| 282 | + # If pytest_mark is set, this file must exist. |
| 283 | + logging.info("Downloading pytest result file.") |
| 284 | + download_pytest_result(azure_run) |
| 285 | + if azure_run.status == RunStatus.FAILED: |
| 286 | + raise ValueError(f"The AzureML run failed. Please check this URL for details: " |
| 287 | + f"{azure_run.get_portal_url()}") |
283 | 288 |
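download_pytest_result is defined elsewhere in the repository. As a rough sketch only, pulling a result file from an AzureML run boils down to a call like the following, where the helper and the file name are assumptions made for illustration:

from pathlib import Path
from azureml.core import Run

def download_pytest_result_sketch(azure_run: Run, destination: Path = Path(".")) -> Path:
    # Hypothetical file name; the real helper knows under which name the job uploaded its results.
    result_name = "test-results.xml"
    output_path = destination / result_name
    azure_run.download_file(name=result_name, output_file_path=str(output_path))
    return output_path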
284 | 289 | hyperdrive_config = None |
285 | 290 | if self.azure_config.hyperdrive: |
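The body of this if-branch falls outside the hunks shown here. For orientation only, a HyperDrive configuration of the kind hyperdrive_config ends up holding can be sketched with the azureml SDK as follows; the metric name and parameter space are invented for illustration:

from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice

def build_hyperdrive_config_sketch(run_config: ScriptRunConfig) -> HyperDriveConfig:
    # Hypothetical parameter space: one child run per cross-validation split.
    sampling = GridParameterSampling({"cross_validation_split_index": choice(0, 1, 2, 3, 4)})
    return HyperDriveConfig(run_config=run_config,
                            hyperparameter_sampling=sampling,
                            primary_metric_name="val/loss",  # assumed metric name
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                            max_total_runs=5)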
@@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None: |
326 | 331 | commandline_args=" ".join(source_config.script_params)), |
327 | 332 | after_submission=after_submission_hook, |
328 | 333 | hyperdrive_config=hyperdrive_config) |
| 334 | + # Set the default display name to what was provided as the "tag" |
| 335 | + if self.azure_config.tag: |
| 336 | + azure_run_info.run.display_name = self.azure_config.tag |
329 | 337 | else: |
330 | 338 | # compute_cluster_name is a required parameter in early versions of the HI-ML package |
331 | 339 | azure_run_info = submit_to_azure_if_needed( |
332 | 340 | input_datasets=input_datasets, |
333 | | - submit_to_azureml=False, |
334 | | - compute_cluster_name="") |
| 341 | + submit_to_azureml=False) |
335 | 342 | finally: |
336 | 343 | if temp_conda: |
337 | 344 | temp_conda.unlink() |
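For context, temp_conda is the merged Conda environment file created earlier in this method from the InnerEye and submodule environment definitions. A minimal sketch of that create-use-delete pattern, assuming merge_conda_files (imported near the top of this diff) writes the union of several environment files to a given path:

import tempfile
from pathlib import Path
from health_azure.utils import merge_conda_files

# Hypothetical input files and call pattern; the exact signature is assumed here.
env_files = [Path("environment.yml"), Path("hi-ml/environment.yml")]
temp_conda = Path(tempfile.mkdtemp()) / "merged_environment.yml"
merge_conda_files(env_files, temp_conda)
try:
    pass  # submit_to_azure_if_needed(..., conda_environment_file=temp_conda) would use the merged file
finally:
    temp_conda.unlink()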