Lightning-AI · Borda · Jun 7, 2022 · Jun 9, 2022 · Jun 9, 2022 · Jun 9, 2022
@@ -26,7 +26,7 @@ jobs:
   - job: benchmarks
     timeoutInMinutes: "90"
     cancelTimeoutInMinutes: "2"
-    pool: azure-jirka-spot
+    pool: azure-gpus-spot
     container:
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
@@ -46,5 +46,6 @@ jobs:
     - bash: python -m pytest benchmarks -v --durations=0
       env:
         PL_RUNNING_BENCHMARKS: 1
+        CUDA_LAUNCH_BLOCKING: 1
       workingDirectory: tests/tests_pytorch
       displayName: 'Testing: PyTorch benchmarks'
@@ -26,7 +26,7 @@ jobs:
     timeoutInMinutes: "100"
     # how much time to give tasks that 'run always, even if cancelled' before stopping them
     cancelTimeoutInMinutes: "2"
-    pool: azure-jirka-spot
+    pool: azure-gpus-spot
     container:
       image: $(image)
       # default shm size is 64m. Increase it to avoid:
@@ -65,7 +65,7 @@ jobs:
         python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
         python requirements/pytorch/check-avail-strategies.py
         python requirements/pytorch/check-avail-extras.py
-      displayName: 'Env details'
+      displayName: 'Env. details'
 
     - bash: bash .actions/pull_legacy_checkpoints.sh
       displayName: 'Get legacy checkpoints'
@@ -74,14 +74,19 @@ jobs:
       workingDirectory: src/pytorch_lightning
       displayName: 'Testing: PyTorch doctests'
 
-    - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+    - bash: |
+        set -eo pipefail
+        python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
       displayName: 'Testing: PyTorch standard'
+      env:
+        CUDA_LAUNCH_BLOCKING: 1
       workingDirectory: tests/tests_pytorch
 
     - bash: bash run_standalone_tests.sh
       workingDirectory: tests/tests_pytorch
       env:
         PL_USE_MOCKED_MNIST: "1"
+        CUDA_LAUNCH_BLOCKING: 1
       displayName: 'Testing: PyTorch standalone tests'
 
     - bash: |
@@ -109,6 +114,7 @@ jobs:
       workingDirectory: examples
       env:
         PL_USE_MOCKED_MNIST: "1"
+        CUDA_LAUNCH_BLOCKING: 1
       displayName: 'Testing: PyTorch examples'
 
     - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0

@@ -79,8 +79,8 @@ jobs:
       matrix:
         include:
           # the config used in '.azure/gpu-tests.yml'
-          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
-          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
+          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "10.2", ubuntu_version: "18.04"}
+          - {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
           # latest (used in Tutorials)
           - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}

@@ -6,6 +6,7 @@ on:
   schedule:
     # At the end of every day
     - cron: "0 0 * * *"
+  push: {}  # FIXME: presumably a temporary trigger for testing this change - confirm and remove before merging
 
 env:
   PUSH_TO_HUB: true
@@ -68,7 +69,7 @@ jobs:
       matrix:
         # the config used in '.circleci/config.yml'
         python_version: ["3.7"]
-        xla_version: ["1.8"]
+        xla_version: ["1.11"]
 
     steps:
       - name: Checkout
@@ -114,9 +115,9 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
-          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
+          # the config used in '.azure/gpu-tests.yml'
+          - {python_version: "3.7", pytorch_version: "1.9", cuda_version: "10.2", ubuntu_version: "18.04"}
+          - {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
           # latest (used in Tutorials)
           - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
@@ -143,7 +144,7 @@ jobs:
             UBUNTU_VERSION=${{ matrix.ubuntu_version }}
           file: dockers/base-cuda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
-          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+          tags: pytorchlightning/pytorch_lightning:base-cuda${{ matrix.cuda_version }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
         timeout-minutes: 95
 
       # report failure to Slack

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG PYTHON_VERSION=3.9
-ARG PYTORCH_VERSION=1.9
+ARG PYTHON_VERSION=3.7
+ARG PYTORCH_VERSION=1.11
 
 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
 

@@ -375,7 +375,7 @@ def test_callbacks_references_fit_ckpt_path(tmpdir):
     trainer.fit(model, datamodule=dm, ckpt_path=str(tmpdir / "last.ckpt"))
 
 
-@RunIf(min_cuda_gpus=2)
+@RunIf(min_cuda_gpus=2, standalone=True)
 def test_running_test_pretrained_model_distrib_dp(tmpdir):
     """Verify `test()` on pretrained model."""
 
@@ -424,7 +424,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
         tpipes.run_model_prediction(pretrained_model, dataloader)
 
 
-@RunIf(min_cuda_gpus=2)
+@RunIf(min_cuda_gpus=2, standalone=True)
 def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
     """Verify `test()` on pretrained model."""
     tutils.set_random_main_port()
@@ -558,7 +558,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
     new_trainer.test(pretrained_model)
 
 
-@RunIf(min_cuda_gpus=2)
+@RunIf(min_cuda_gpus=2, standalone=True)
 def test_dp_resume(tmpdir):
     """Make sure DP continues training correctly."""
     model = CustomClassificationModelDP(lr=0.1)

@@ -34,8 +34,10 @@
     "trainer_kwargs",
     (
         pytest.param(dict(accelerator="gpu", devices=1), marks=RunIf(min_cuda_gpus=1)),
-        pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)),
-        pytest.param(dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)),
+        pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True)),
+        pytest.param(
+            dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True)
+        ),
         pytest.param(dict(accelerator="mps", devices=1), marks=RunIf(mps=True)),
     ),
 )

@@ -831,7 +831,7 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir):
     assert trainer.state.finished, "DDP Training failed"
 
 
-@RunIf(min_cuda_gpus=3)
+@RunIf(min_cuda_gpus=3, standalone=True)
 def test_batch_size_smaller_than_num_gpus(tmpdir):
     # we need at least 3 gpus for this test
     num_gpus = 3
@@ -869,6 +869,7 @@ def train_dataloader(self):
         limit_train_batches=0.1,
         limit_val_batches=0,
         accelerator="gpu",
+        strategy="ddp",
         devices=num_gpus,
     )