diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml index dee425054d5..dd850c14e87 100644 --- a/.github/workflows/prototype-tests-linux-gpu.yml +++ b/.github/workflows/prototype-tests-linux-gpu.yml @@ -18,7 +18,7 @@ jobs: - python-version: "3.8" runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "11.7" + gpu-arch-version: "11.8" fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c1010d9aaf3..cd6011b4ad4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,7 +24,7 @@ jobs: - python-version: 3.8 runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "11.7" + gpu-arch-version: "11.8" fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: @@ -85,7 +85,7 @@ jobs: - python-version: "3.8" runner: windows.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "11.7" + gpu-arch-version: "11.8" fail-fast: false uses: pytorch/test-infra/.github/workflows/windows_job.yml@main with: diff --git a/test/test_models.py b/test/test_models.py index e1a288f4eb5..91aa66c667e 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -682,10 +682,11 @@ def test_classification_model(model_fn, dev): model_name = model_fn.__name__ if SKIP_BIG_MODEL and is_skippable(model_name, dev): pytest.skip("Skipped to reduce memory usage. Set env var SKIP_BIG_MODEL=0 to enable test for this model") - if model_name == "vit_h_14" and dev == "cuda": - # TODO: investigate why this fail on CI. 
It doesn't fail on AWS cluster with CUDA 11.6 - # (can't test with later versions ATM) - pytest.xfail("https://github.com/pytorch/vision/issues/7143") + if model_name == "resnet101" and dev == "cuda": + # TODO: Investigate the failure with CUDA 11.8: https://github.com/pytorch/vision/issues/7618 + # TODO: Investigate/follow up on the previous failure: https://github.com/pytorch/vision/issues/7143 + # It is no longer happening on CI with CUDA 11.8. Follow up is needed if it is still not resolved. + pytest.xfail("https://github.com/pytorch/vision/issues/7618") kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape")