CI: Azure - multiple configs (#12984)

Borda · akihironitta · carmocca · carmocca · commit 460147dd4e9a · 2022-05-24T15:43:58.000+02:00
* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml
@@ -28,18 +28,12 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-gpus-spot
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
 
     steps:
-      - bash: |
-          # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-          pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
-          pip list
-        displayName: 'Install PyTorch LTS'
-
       - bash: |
           python -m pytest tests/benchmarks -v --durations=0
         displayName: 'Testing: benchmarks'
diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
@@ -18,18 +18,21 @@ pr:
 
 jobs:
   - job: pytest
+    strategy:
+      matrix:
+        'PyTorch - LTS':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+        'PyTorch - stable':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "65"
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
 
     pool: azure-gpus-spot
 
-    # ToDo: this need to have installed docker in the base image...
     container:
-      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      # run on torch 1.8 as it's the LTS version
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: $(image)
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -52,8 +55,6 @@ jobs:
     - bash: |
         python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt
         pip install . --requirement requirements/strategies.txt
diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml
@@ -16,9 +16,9 @@ jobs:
           pip install "check-jsonschema>=0.10"
 
       - name: GH Workflows
-        run: |
-          check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
+        run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
 
       - name: Azure Pipelines
-        run: |
-          check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
+        env:
+          SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
+        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG CUDA_VERSION=11.3.1
 ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=11.3.1
 
-# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.9