
Commit fab2ff3

Borda, akihironitta, and carmocca authored

CI: Azure - multiple configs (#12984)

* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent d28e365 commit fab2ff3

File tree

5 files changed: +20 −26 lines changed

.azure-pipelines/gpu-benchmark.yml
.azure-pipelines/gpu-tests.yml
.github/workflows/ci_schema.yml
dockers/base-cuda/Dockerfile
tests/strategies/test_ddp_fully_sharded_native.py

.azure-pipelines/gpu-benchmark.yml

Lines changed: 1 addition & 7 deletions
@@ -28,18 +28,12 @@ jobs:
   cancelTimeoutInMinutes: "2"
   pool: azure-gpus-spot
   container:
-    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
     options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
   workspace:
     clean: all
 
   steps:
-  - bash: |
-      # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-      pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
-      pip list
-    displayName: 'Install PyTorch LTS'
-
   - bash: |
       python -m pytest tests/benchmarks -v --durations=0
     displayName: 'Testing: benchmarks'

.azure-pipelines/gpu-tests.yml

Lines changed: 7 additions & 6 deletions
@@ -18,18 +18,21 @@ pr:
 
 jobs:
 - job: pytest
+  strategy:
+    matrix:
+      'PyTorch - LTS':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      'PyTorch - stable':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
   # how long to run the job before automatically cancelling
   timeoutInMinutes: "65"
   # how much time to give 'run always even if cancelled tasks' before stopping them
   cancelTimeoutInMinutes: "2"
 
   pool: azure-gpus-spot
 
-  # ToDo: this need to have installed docker in the base image...
   container:
-    # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-    # run on torch 1.8 as it's the LTS version
-    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+    image: $(image)
   # default shm size is 64m. Increase it to avoid:
   # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
   options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"

@@ -52,8 +55,6 @@ jobs:
   - bash: |
       python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-      # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-      pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
       pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
       pip install . --requirement requirements/devel.txt
       pip install . --requirement requirements/strategies.txt
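
Azure Pipelines expands a strategy/matrix into one job per entry, substituting each entry's variables wherever $(image) appears, so the single pytest job above now runs once against the LTS image and once against the stable image. A minimal, self-contained sketch of the same pattern; the pool, image tags, and step below are illustrative, not this repository's:

jobs:
- job: pytest
  strategy:
    matrix:
      'PyTorch - LTS':
        image: "example/torch:1.8"      # placeholder tag
      'PyTorch - stable':
        image: "example/torch:1.11"     # placeholder tag
  pool:
    vmImage: 'ubuntu-20.04'             # hosted pool, for illustration only
  container:
    image: $(image)                     # resolved per matrix entry
  steps:
  - bash: python -c "import torch; print(torch.__version__)"
    displayName: 'Print torch version'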

.github/workflows/ci_schema.yml

Lines changed: 4 additions & 4 deletions
@@ -16,9 +16,9 @@ jobs:
       pip install "check-jsonschema>=0.10"
 
     - name: GH Workflows
-      run: |
-        check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
+      run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
 
     - name: Azure Pipelines
-      run: |
-        check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
+      env:
+        SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
+      run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
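
This moves the schema URL into an env var and bumps the pinned azure-pipelines-vscode schema from v1.188.1 to v1.204.0. For context, a self-contained workflow running the same two checks might look as follows; the workflow name, trigger, and action versions are assumptions, while the two check-jsonschema invocations are taken from the diff:

name: schema-check          # illustrative name
on: [pull_request]

jobs:
  check:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - run: pip install "check-jsonschema>=0.10"
      - name: GH Workflows
        run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
      - name: Azure Pipelines
        env:
          SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"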

dockers/base-cuda/Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG CUDA_VERSION=11.3.1
 ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=11.3.1
 
-# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.9

tests/strategies/test_ddp_fully_sharded_native.py

Lines changed: 7 additions & 7 deletions
@@ -9,16 +9,16 @@
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
 
-if _TORCH_GREATER_EQUAL_1_11:
+if _TORCH_GREATER_EQUAL_1_12:
     from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
     from torch.distributed.fsdp.wrap import wrap
 
 
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_invalid_on_cpu(tmpdir):
     """Test to ensure that to raise Misconfiguration for Native FSDP on CPU."""
     with pytest.raises(

@@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 @mock.patch("torch.cuda.device_count", return_value=1)
 @mock.patch("torch.cuda.is_available", return_value=True)
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
     """Test to ensure that plugin native amp plugin raises Misconfiguration error."""
     with pytest.raises(

@@ -102,7 +102,7 @@ def _assert_layer_fsdp_instance(self) -> None:
     assert self.layer.module[2].reshard_after_forward is True
 
 
-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""
 

@@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
 
 
-@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
 

@@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
 
 
-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""
 
Comments (0)