Merged (changes from all 25 commits)
8 changes: 1 addition & 7 deletions .azure-pipelines/gpu-benchmark.yml
@@ -28,18 +28,12 @@ jobs:
cancelTimeoutInMinutes: "2"
pool: azure-gpus-spot
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
workspace:
clean: all

steps:
- bash: |
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
pip list
displayName: 'Install PyTorch LTS'

- bash: |
python -m pytest tests/benchmarks -v --durations=0
displayName: 'Testing: benchmarks'
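With the manual LTS install step removed, the benchmark job relies on the torch build baked into the new base-cuda-py3.9-torch1.11 image. A hypothetical sanity check (illustration only, not part of this PR) that could run before the benchmarks, assuming the image ships torch 1.11 with a working CUDA runtime:

# Hypothetical pre-benchmark sanity check (illustration only, not part of this PR):
# verify the torch build that ships with the base-cuda-py3.9-torch1.11 image.
import torch

assert torch.__version__.startswith("1.11"), f"unexpected torch {torch.__version__}"
assert torch.cuda.is_available(), "CUDA runtime not visible inside the container"
print(f"torch {torch.__version__} / CUDA {torch.version.cuda}")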
13 changes: 7 additions & 6 deletions .azure-pipelines/gpu-tests.yml
@@ -18,18 +18,21 @@ pr:

jobs:
- job: pytest
strategy:
matrix:
'PyTorch - LTS':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
'PyTorch - stable':
Comment on lines +23 to +25 (Contributor):
nit: can we shorten this text?

Suggested change
'PyTorch - LTS':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
'PyTorch - stable':
'LTS':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
'stable':

image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
# how long to run the job before automatically cancelling
timeoutInMinutes: "65"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"

pool: azure-gpus-spot

# ToDo: this need to have installed docker in the base image...
container:
# base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
# run on torch 1.8 as it's the LTS version
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
image: $(image)
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -52,8 +55,6 @@ jobs:
- bash: |
python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
pip install . --requirement requirements/devel.txt
pip install . --requirement requirements/strategies.txt
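For reference, the CUDA_VERSION_MM one-liner kept in the install step builds the major+minor CUDA tag that selects the matching bagua wheel. Expanded into plain Python for readability (illustrative only; the pipeline keeps it as a single python -c call):

# Illustrative expansion of the CUDA_VERSION_MM one-liner from the pipeline.
import torch

major, minor = torch.version.cuda.split(".")[:2]
cuda_version_mm = f"{major}{minor}"            # e.g. "113" for CUDA 11.3
print(f"bagua-cuda{cuda_version_mm}>=0.9.0")   # the requirement string passed to pip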
8 changes: 4 additions & 4 deletions .github/workflows/ci_schema.yml
@@ -16,9 +16,9 @@ jobs:
pip install "check-jsonschema>=0.10"

- name: GH Workflows
run: |
check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"

- name: Azure Pipelines
run: |
check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
env:
SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
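The step now passes the schema URL through the SCHEMA_FILE environment variable and bumps the pinned azure-pipelines-vscode schema from v1.188.1 to v1.204.0. A rough hand-rolled illustration of the same validation that check-jsonschema performs, assuming pyyaml and jsonschema are available and using the URL pinned above:

# Rough equivalent of the check-jsonschema step, for illustration only.
import json
import urllib.request

import yaml
from jsonschema import validate

SCHEMA_FILE = "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json"

with urllib.request.urlopen(SCHEMA_FILE) as resp:
    schema = json.load(resp)

with open(".azure-pipelines/gpu-tests.yml") as f:
    pipeline = yaml.safe_load(f)

validate(instance=pipeline, schema=schema)  # raises jsonschema.ValidationError on mismatch
print("gpu-tests.yml conforms to the Azure Pipelines schema")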
3 changes: 1 addition & 2 deletions dockers/base-cuda/Dockerfile
@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG CUDA_VERSION=11.3.1
ARG UBUNTU_VERSION=20.04
ARG CUDA_VERSION=11.3.1

# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG PYTHON_VERSION=3.9
14 changes: 7 additions & 7 deletions tests/strategies/test_ddp_fully_sharded_native.py
@@ -9,16 +9,16 @@
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
from tests.helpers.boring_model import BoringModel
from tests.helpers.runif import RunIf

if _TORCH_GREATER_EQUAL_1_11:
if _TORCH_GREATER_EQUAL_1_12:
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
from torch.distributed.fsdp.wrap import wrap


@RunIf(min_torch="1.11")
@RunIf(min_torch="1.12dev")
def test_invalid_on_cpu(tmpdir):
"""Test to ensure that to raise Misconfiguration for Native FSDP on CPU."""
with pytest.raises(
@@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("torch.cuda.is_available", return_value=True)
@RunIf(min_torch="1.11")
@RunIf(min_torch="1.12dev")
def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
"""Test to ensure that plugin native amp plugin raises Misconfiguration error."""
with pytest.raises(
@@ -102,7 +102,7 @@ def _assert_layer_fsdp_instance(self) -> None:
assert self.layer.module[2].reshard_after_forward is True


@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
"""Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""

@@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
_run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))


@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
def test_fully_sharded_native_strategy_checkpoint(tmpdir):
"""Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""

@@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
_run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))


@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
"""Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""

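These tests now gate on torch 1.12 (min_torch="1.12dev" and the _TORCH_GREATER_EQUAL_1_12 flag), since the native FSDP APIs they exercise target that release. A minimal sketch of how such a version guard can be derived (illustrative only; not necessarily how pytorch_lightning.utilities.imports implements it):

# Illustrative sketch of a torch version guard; the real flag lives in
# pytorch_lightning.utilities.imports and may be computed differently.
from packaging.version import Version

import torch

# Strip any local build suffix such as "+cu113" before comparing.
_TORCH_GREATER_EQUAL_1_12 = Version(torch.__version__.split("+")[0]) >= Version("1.12.0.dev0")

if _TORCH_GREATER_EQUAL_1_12:
    # Mirrors the guarded import pattern used at the top of the test module.
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel  # noqa: F401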