diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 4d3eaddd41f90..0a60bca38d2c1 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -26,7 +26,7 @@ jobs: - job: benchmarks timeoutInMinutes: "90" cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot + pool: azure-gpus-spot container: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" @@ -46,5 +46,6 @@ jobs: - bash: python -m pytest benchmarks -v --durations=0 env: PL_RUNNING_BENCHMARKS: 1 + CUDA_LAUNCH_BLOCKING: 1 workingDirectory: tests/tests_pytorch displayName: 'Testing: PyTorch benchmarks' diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f84463a6615b3..a1a3c187f5ea3 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: timeoutInMinutes: "100" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot + pool: azure-gpus-spot container: image: $(image) # default shm size is 64m. Increase it to avoid: @@ -65,7 +65,7 @@ jobs: python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py - displayName: 'Env details' + displayName: 'Env. details' - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' @@ -74,14 +74,19 @@ jobs: workingDirectory: src/pytorch_lightning displayName: 'Testing: PyTorch doctests' - - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + - bash: | + set -eo pipefail + python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: PyTorch standard' + env: + CUDA_LAUNCH_BLOCKING: 1 workingDirectory: tests/tests_pytorch - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_pytorch env: PL_USE_MOCKED_MNIST: "1" + CUDA_LAUNCH_BLOCKING: 1 displayName: 'Testing: PyTorch standalone tests' - bash: | @@ -109,6 +114,7 @@ jobs: workingDirectory: examples env: PL_USE_MOCKED_MNIST: "1" + CUDA_LAUNCH_BLOCKING: 1 displayName: 'Testing: PyTorch examples' - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 diff --git a/.github/workflows/ci-pytorch_dockers.yml b/.github/workflows/ci-pytorch_dockers.yml index 69d5955c5db33..50ea23f1309e5 100644 --- a/.github/workflows/ci-pytorch_dockers.yml +++ b/.github/workflows/ci-pytorch_dockers.yml @@ -79,8 +79,8 @@ jobs: matrix: include: # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} - - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "10.2", ubuntu_version: "18.04"} + - {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} # latest (used in Tutorials) - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 0325671413dbb..a96a22843b794 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -6,6 +6,7 @@ on: schedule: # At the end of every day - cron: "0 0 * * *" + push: {} # fixme env: PUSH_TO_HUB: true @@ -68,7 +69,7 @@ jobs: matrix: # the config used in '.circleci/config.yml`' python_version: ["3.7"] - xla_version: ["1.8"] + xla_version: ["1.11"] steps: - name: Checkout @@ -114,9 +115,9 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} - - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # the config used in '.azure/gpu-tests.yml' + - {python_version: "3.7", pytorch_version: "1.9", cuda_version: "10.2", ubuntu_version: "18.04"} + - {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} # latest (used in Tutorials) - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} @@ -143,7 +144,7 @@ jobs: UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda${{ matrix.cuda_version }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 95 # report failure to Slack diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 530cc9ec6fa2c..efce1de9d17cf 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 +ARG PYTHON_VERSION=3.7 +ARG PYTORCH_VERSION=1.11 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/tests/tests_pytorch/models/test_restore.py b/tests/tests_pytorch/models/test_restore.py index 77f45928dd907..2573eb9f2a5a2 100644 --- a/tests/tests_pytorch/models/test_restore.py +++ b/tests/tests_pytorch/models/test_restore.py @@ -375,7 +375,7 @@ def test_callbacks_references_fit_ckpt_path(tmpdir): trainer.fit(model, datamodule=dm, ckpt_path=str(tmpdir / "last.ckpt")) -@RunIf(min_cuda_gpus=2) +@RunIf(min_cuda_gpus=2, standalone=True) def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" @@ -424,7 +424,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): tpipes.run_model_prediction(pretrained_model, dataloader) -@RunIf(min_cuda_gpus=2) +@RunIf(min_cuda_gpus=2, standalone=True) def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): """Verify `test()` on pretrained model.""" tutils.set_random_main_port() @@ -558,7 +558,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template): new_trainer.test(pretrained_model) -@RunIf(min_cuda_gpus=2) +@RunIf(min_cuda_gpus=2, standalone=True) def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" model = CustomClassificationModelDP(lr=0.1) diff --git a/tests/tests_pytorch/strategies/test_common.py b/tests/tests_pytorch/strategies/test_common.py index 479b222e25a9d..949e0911b50c1 100644 --- a/tests/tests_pytorch/strategies/test_common.py +++ b/tests/tests_pytorch/strategies/test_common.py @@ -34,8 +34,10 @@ "trainer_kwargs", ( pytest.param(dict(accelerator="gpu", devices=1), marks=RunIf(min_cuda_gpus=1)), - pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)), - pytest.param(dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)), + pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True)), + pytest.param( + dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True) + ), pytest.param(dict(accelerator="mps", devices=1), marks=RunIf(mps=True)), ), ) diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 6b01150c80857..0ab4bc97852e9 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -831,7 +831,7 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir): assert trainer.state.finished, "DDP Training failed" -@RunIf(min_cuda_gpus=3) +@RunIf(min_cuda_gpus=3, standalone=True) def test_batch_size_smaller_than_num_gpus(tmpdir): # we need at least 3 gpus for this test num_gpus = 3 @@ -869,6 +869,7 @@ def train_dataloader(self): limit_train_batches=0.1, limit_val_batches=0, accelerator="gpu", + strategy="ddp", devices=num_gpus, )