This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit c42937c

ant0nsc authored and javier-alvarez committed
Cancel queued AzureML jobs when starting a PR build (#640)
AzureML jobs from previous PR builds on the same branch were not cancelled, consuming excessive resources. The PR build now cancels all queued and running jobs before starting new ones.
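The mechanism relies on the convention that every run of the PR build writes its AzureML runs into an experiment named after the source branch, with slashes replaced by underscores. A minimal sketch of that mapping (the helper name is hypothetical; the replacement mirrors the `branch.replace("/", "_")` step in `cancel_aml_jobs.py` further down):

```python
def experiment_name_for_branch(branch: str) -> str:
    """Map an Azure DevOps source branch to the AzureML experiment name.

    Mirrors the branch.replace("/", "_") step in cancel_aml_jobs.py, so a new
    build can find (and cancel) runs from earlier builds of the same PR.
    """
    return branch.replace("/", "_")


# Azure DevOps reports a PR build's source branch as refs/pull/<id>/merge:
print(experiment_name_for_branch("refs/pull/640/merge"))  # refs_pull_640_merge
```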
1 parent cad2e04 commit c42937c

File tree

9 files changed: +112 −18 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ loss.
 ### Added
 - ([#648](https://github.com/microsoft/InnerEye-DeepLearning/pull/648)) Add torch_ort to SSL SimCLR. This makes training faster.
 - ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
+- ([#640](https://github.com/microsoft/InnerEye-DeepLearning/pull/640)) Cancel AzureML jobs from previous runs of the PR build in the same branch to reduce AML load
 - ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor
   GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
   `BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
azure-pipelines/azureml-conda-environment.yml

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+name: AzureML_SDK
+channels:
+  - defaults
+dependencies:
+  - pip=20.1.1
+  - python=3.7.3
+  - pip:
+    - azureml-sdk==1.36.0

azure-pipelines/build-pr.yml

Lines changed: 14 additions & 0 deletions

@@ -17,6 +17,12 @@ variables:
   disable.coverage.autogenerate: 'true'

 jobs:
+- job: CancelPreviousJobs
+  pool:
+    vmImage: 'ubuntu-18.04'
+  steps:
+    - template: cancel_aml_jobs.yml
+
 - job: Windows
   pool:
     vmImage: 'windows-2019'
@@ -30,6 +36,7 @@ jobs:
 - template: build.yaml

 - job: TrainInAzureML
+  dependsOn: CancelPreviousJobs
   variables:
   - name: tag
     value: 'TrainBasicModel'
@@ -48,6 +55,7 @@ jobs:
   test_run_title: tests_after_training_single_run

 - job: RunGpuTestsInAzureML
+  dependsOn: CancelPreviousJobs
   variables:
   - name: tag
     value: 'RunGpuTests'
@@ -70,6 +78,7 @@ jobs:
 # is trained, because we use this build to also check the "submit_for_inference" code, that
 # presently only handles single channel models.
 - job: TrainInAzureMLViaSubmodule
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'BasicModel2Epochs1Channel'
@@ -90,6 +99,7 @@ jobs:

 # Train a 2-element ensemble model
 - job: TrainEnsemble
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'BasicModelForEnsembleTest'
@@ -114,6 +124,7 @@ jobs:

 # Train a model on 2 nodes
 - job: Train2Nodes
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'BasicModel2EpochsMoreData'
@@ -135,6 +146,7 @@ jobs:
   test_run_title: tests_after_training_2node_run

 - job: TrainHelloWorld
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'HelloWorld'
@@ -152,6 +164,7 @@ jobs:
 # Run HelloContainer on 2 nodes. HelloContainer uses native Lightning test set inference, which can get
 # confused after doing multi-node training in the same script.
 - job: TrainHelloContainer
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'HelloContainer'
@@ -176,6 +189,7 @@ jobs:
 # regressions in AML when requesting more than the default amount of memory. This needs to run with all subjects to
 # trigger the bug, total runtime 10min
 - job: TrainLung
+  dependsOn: CancelPreviousJobs
   variables:
   - name: model
     value: 'Lung'

azure-pipelines/build_data_quality.yaml

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,8 @@
 steps:
 - template: checkout.yml

+- template: prepare_conda.yml
+
 - bash: |
     conda env create --file InnerEye-DataQuality/environment.yml --name InnerEyeDataQuality
     source activate InnerEyeDataQuality

azure-pipelines/cancel_aml_jobs.py

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+# ------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+# ------------------------------------------------------------------------------------------
+import os
+
+from azureml._restclient.constants import RunStatus
+from azureml.core import Experiment, Run, Workspace
+from azureml.core.authentication import ServicePrincipalAuthentication
+
+
+def cancel_running_and_queued_jobs() -> None:
+    environ = os.environ
+    print("Authenticating")
+    auth = ServicePrincipalAuthentication(
+        tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47',
+        service_principal_id=environ["APPLICATION_ID"],
+        service_principal_password=environ["APPLICATION_KEY"])
+    print("Getting AML workspace")
+    workspace = Workspace.get(
+        name="InnerEye-DeepLearning",
+        auth=auth,
+        subscription_id=environ["SUBSCRIPTION_ID"],
+        resource_group="InnerEye-DeepLearning")
+    branch = environ["BRANCH"]
+    print(f"Branch: {branch}")
+    if not branch.startswith("refs/pull/"):
+        print("This branch is not a PR branch, hence not cancelling anything.")
+        exit(0)
+    experiment_name = branch.replace("/", "_")
+    print(f"Experiment: {experiment_name}")
+    experiment = Experiment(workspace, name=experiment_name)
+    print(f"Retrieved experiment {experiment.name}")
+    for run in experiment.get_runs(include_children=True, properties={}):
+        assert isinstance(run, Run)
+        status_suffix = f"'{run.status}' run {run.id} ({run.display_name})"
+        if run.status in (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.FINALIZING, RunStatus.CANCELED,
+                          RunStatus.CANCEL_REQUESTED):
+            print(f"Skipping {status_suffix}")
+        else:
+            print(f"Cancelling {status_suffix}")
+            run.cancel()
+
+
+if __name__ == "__main__":
+    cancel_running_and_queued_jobs()
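The status filter in this script leaves finished or already-cancelling runs alone and cancels everything else (queued, starting, running). Isolated as plain strings, the decision logic looks roughly like the sketch below; the string values are assumptions about what azureml's `RunStatus` constants expand to, not taken from the source:

```python
# Runs in these states need no cancellation; the strings are assumed to match
# the azureml RunStatus constants used in cancel_aml_jobs.py.
TERMINAL_OR_CANCELLING = {"Completed", "Failed", "Finalizing", "Canceled", "CancelRequested"}


def should_cancel(status: str) -> bool:
    """True for runs a fresh PR build should cancel (e.g. Queued, Running)."""
    return status not in TERMINAL_OR_CANCELLING


for status in ("Queued", "Running", "Completed", "CancelRequested"):
    print(f"{status}: {'cancel' if should_cancel(status) else 'skip'}")
```

Filtering by an allow-list of terminal states, rather than a deny-list of active ones, means any new or unknown active state defaults to being cancelled, which is the safe direction for resource cleanup.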
azure-pipelines/cancel_aml_jobs.yml

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+steps:
+- checkout: self
+
+- template: prepare_conda.yml
+
+# https://docs.microsoft.com/en-us/azure/devops/pipelines/release/caching?view=azure-devops#pythonanaconda
+- task: Cache@2
+  displayName: Use cached Conda environment AzureML_SDK
+  inputs:
+    # Beware of changing the cache key or path independently, safest to change in sync
+    key: 'usr_share_miniconda_azureml_conda | "$(Agent.OS)" | azure-pipelines/azureml-conda-environment.yml'
+    path: /usr/share/miniconda/envs
+    cacheHitVar: CONDA_CACHE_RESTORED
+
+- script: conda env create --file azure-pipelines/azureml-conda-environment.yml
+  displayName: Create Conda environment AzureML_SDK
+  condition: eq(variables.CONDA_CACHE_RESTORED, 'false')
+
+- bash: |
+    source activate AzureML_SDK
+    python azure-pipelines/cancel_aml_jobs.py
+  displayName: Cancel jobs from previous run
+  env:
+    SUBSCRIPTION_ID: $(InnerEyeDevSubscriptionID)
+    APPLICATION_ID: $(InnerEyeDeepLearningServicePrincipalID)
+    APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
+    BRANCH: $(Build.SourceBranch)

azure-pipelines/checkout.yml

Lines changed: 0 additions & 18 deletions

@@ -2,21 +2,3 @@ steps:
 - checkout: self
   lfs: true
   submodules: true
-
-- bash: |
-    subdir=bin
-    echo "Adding this directory to PATH: $CONDA/$subdir"
-    echo "##vso[task.prependpath]$CONDA/$subdir"
-  displayName: Add conda to PATH
-  condition: succeeded()
-
-- bash: |
-    conda install conda=4.8.3 -y
-    conda --version
-    conda list
-  displayName: Print conda version and initial package list
-
-- bash: |
-    sudo chown -R $USER /usr/share/miniconda
-  condition: and(succeeded(), eq( variables['Agent.OS'], 'Linux' ))
-  displayName: Take ownership of conda installation

azure-pipelines/inner_eye_env.yml

Lines changed: 2 additions & 0 deletions

@@ -3,6 +3,8 @@ steps:

 - template: store_settings.yml

+- template: prepare_conda.yml
+
 # https://docs.microsoft.com/en-us/azure/devops/pipelines/release/caching?view=azure-devops#pythonanaconda
 - task: Cache@2
   displayName: Use cached Conda environment

azure-pipelines/prepare_conda.yml

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+steps:
+- bash: |
+    subdir=bin
+    echo "Adding this directory to PATH: $CONDA/$subdir"
+    echo "##vso[task.prependpath]$CONDA/$subdir"
+  displayName: Add conda to PATH
+  condition: succeeded()
+
+- bash: |
+    sudo chown -R $USER /usr/share/miniconda
+  condition: and(succeeded(), eq( variables['Agent.OS'], 'Linux' ))
+  displayName: Take ownership of conda installation

0 commit comments