
Commit fab2ff3

Borda, akihironitta, and carmocca authored

CI: Azure - multiple configs (#12984)

* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent d28e365 commit fab2ff3

File tree

5 files changed: +20 −26 lines changed

.azure-pipelines/gpu-benchmark.yml
.azure-pipelines/gpu-tests.yml
.github/workflows/ci_schema.yml
dockers/base-cuda/Dockerfile
tests/strategies/test_ddp_fully_sharded_native.py

.azure-pipelines/gpu-benchmark.yml

Lines changed: 1 addition & 7 deletions
@@ -28,18 +28,12 @@ jobs:
   cancelTimeoutInMinutes: "2"
   pool: azure-gpus-spot
   container:
-    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
     options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
   workspace:
     clean: all
 
   steps:
-  - bash: |
-      # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-      pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
-      pip list
-    displayName: 'Install PyTorch LTS'
-
   - bash: |
       python -m pytest tests/benchmarks -v --durations=0
     displayName: 'Testing: benchmarks'

.azure-pipelines/gpu-tests.yml

Lines changed: 7 additions & 6 deletions
@@ -18,18 +18,21 @@ pr:
 
 jobs:
 - job: pytest
+  strategy:
+    matrix:
+      'PyTorch - LTS':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      'PyTorch - stable':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
   # how long to run the job before automatically cancelling
   timeoutInMinutes: "65"
   # how much time to give 'run always even if cancelled tasks' before stopping them
   cancelTimeoutInMinutes: "2"
 
   pool: azure-gpus-spot
 
-  # ToDo: this need to have installed docker in the base image...
   container:
-    # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-    # run on torch 1.8 as it's the LTS version
-    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+    image: $(image)
   # default shm size is 64m. Increase it to avoid:
   # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
   options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"

@@ -52,8 +55,6 @@ jobs:
   - bash: |
       python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-      # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-      pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
       pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
       pip install . --requirement requirements/devel.txt
       pip install . --requirement requirements/strategies.txt
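
Azure Pipelines expands a strategy/matrix into one job per entry, substituting each entry's variables wherever $(image) appears, so the single pytest job above now runs once against the LTS image and once against the stable image. A minimal, self-contained sketch of the same pattern; the pool, image tags, and step below are illustrative, not this repository's:

jobs:
- job: pytest
  strategy:
    matrix:
      'PyTorch - LTS':
        image: "example/torch:1.8"      # placeholder tag
      'PyTorch - stable':
        image: "example/torch:1.11"     # placeholder tag
  pool:
    vmImage: 'ubuntu-20.04'             # hosted pool, for illustration only
  container:
    image: $(image)                     # resolved per matrix entry
  steps:
  - bash: python -c "import torch; print(torch.__version__)"
    displayName: 'Print torch version'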

.github/workflows/ci_schema.yml

Lines changed: 4 additions & 4 deletions
@@ -16,9 +16,9 @@ jobs:
       pip install "check-jsonschema>=0.10"
 
     - name: GH Workflows
-      run: |
-        check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
+      run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
 
     - name: Azure Pipelines
-      run: |
-        check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
+      env:
+        SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
+      run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
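
This moves the schema URL into an env var and bumps the pinned azure-pipelines-vscode schema from v1.188.1 to v1.204.0. For context, a self-contained workflow running the same two checks might look as follows; the workflow name, trigger, and action versions are assumptions, while the two check-jsonschema invocations are taken from the diff:

name: schema-check          # illustrative name
on: [pull_request]

jobs:
  check:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - run: pip install "check-jsonschema>=0.10"
      - name: GH Workflows
        run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
      - name: Azure Pipelines
        env:
          SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"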

dockers/base-cuda/Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG CUDA_VERSION=11.3.1
 ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=11.3.1
 
-# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.9

tests/strategies/test_ddp_fully_sharded_native.py

Lines changed: 7 additions & 7 deletions
@@ -9,16 +9,16 @@
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
 
-if _TORCH_GREATER_EQUAL_1_11:
+if _TORCH_GREATER_EQUAL_1_12:
     from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
     from torch.distributed.fsdp.wrap import wrap
 
 
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_invalid_on_cpu(tmpdir):
     """Test to ensure that to raise Misconfiguration for Native FSDP on CPU."""
     with pytest.raises(

@@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 @mock.patch("torch.cuda.device_count", return_value=1)
 @mock.patch("torch.cuda.is_available", return_value=True)
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
     """Test to ensure that plugin native amp plugin raises Misconfiguration error."""
     with pytest.raises(

@@ -102,7 +102,7 @@ def _assert_layer_fsdp_instance(self) -> None:
     assert self.layer.module[2].reshard_after_forward is True
 
 
-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""
 

@@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
 
 
-@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
 

@@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
 
 
-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""
 
Comments (0)