Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .azure/gpu-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- job: benchmarks
timeoutInMinutes: "90"
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
pool: azure-gpus-spot
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
Expand All @@ -46,5 +46,6 @@ jobs:
- bash: python -m pytest benchmarks -v --durations=0
env:
PL_RUNNING_BENCHMARKS: 1
CUDA_LAUNCH_BLOCKING: 1
workingDirectory: tests/tests_pytorch
displayName: 'Testing: PyTorch benchmarks'
12 changes: 9 additions & 3 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
timeoutInMinutes: "100"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
pool: azure-gpus-spot
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
Expand Down Expand Up @@ -65,7 +65,7 @@ jobs:
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
python requirements/pytorch/check-avail-strategies.py
python requirements/pytorch/check-avail-extras.py
displayName: 'Env details'
displayName: 'Env. details'

- bash: bash .actions/pull_legacy_checkpoints.sh
displayName: 'Get legacy checkpoints'
Expand All @@ -74,14 +74,19 @@ jobs:
workingDirectory: src/pytorch_lightning
displayName: 'Testing: PyTorch doctests'

- bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
- bash: |
set -eo pipefail
python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
displayName: 'Testing: PyTorch standard'
env:
CUDA_LAUNCH_BLOCKING: 1
workingDirectory: tests/tests_pytorch

- bash: bash run_standalone_tests.sh
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
CUDA_LAUNCH_BLOCKING: 1
displayName: 'Testing: PyTorch standalone tests'

- bash: |
Expand Down Expand Up @@ -109,6 +114,7 @@ jobs:
workingDirectory: examples
env:
PL_USE_MOCKED_MNIST: "1"
CUDA_LAUNCH_BLOCKING: 1
displayName: 'Testing: PyTorch examples'

- bash: python -m pytest benchmarks -v --maxfail=2 --durations=0
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci-pytorch_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ jobs:
matrix:
include:
# the config used in '.azure/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "10.2", ubuntu_version: "18.04"}
- {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# latest (used in Tutorials)
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
Expand Down
11 changes: 6 additions & 5 deletions .github/workflows/events-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
schedule:
# At the end of every day
- cron: "0 0 * * *"
push: {} # fixme

env:
PUSH_TO_HUB: true
Expand Down Expand Up @@ -68,7 +69,7 @@ jobs:
matrix:
# the config used in '.circleci/config.yml'
python_version: ["3.7"]
xla_version: ["1.8"]
xla_version: ["1.11"]

steps:
- name: Checkout
Expand Down Expand Up @@ -114,9 +115,9 @@ jobs:
fail-fast: false
matrix:
include:
# the config used in '.azure-pipelines/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# the config used in '.azure/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.9", cuda_version: "10.2", ubuntu_version: "18.04"}
- {python_version: "3.8", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# latest (used in Tutorials)
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
Expand All @@ -143,7 +144,7 @@ jobs:
UBUNTU_VERSION=${{ matrix.ubuntu_version }}
file: dockers/base-cuda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
tags: pytorchlightning/pytorch_lightning:base-cuda${{ matrix.cuda_version }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 95

# report failure to Slack
Expand Down
4 changes: 2 additions & 2 deletions dockers/tpu-tests/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.9
ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.11

FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}

Expand Down
6 changes: 3 additions & 3 deletions tests/tests_pytorch/models/test_restore.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def test_callbacks_references_fit_ckpt_path(tmpdir):
trainer.fit(model, datamodule=dm, ckpt_path=str(tmpdir / "last.ckpt"))


@RunIf(min_cuda_gpus=2)
@RunIf(min_cuda_gpus=2, standalone=True)
def test_running_test_pretrained_model_distrib_dp(tmpdir):
"""Verify `test()` on pretrained model."""

Expand Down Expand Up @@ -424,7 +424,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
tpipes.run_model_prediction(pretrained_model, dataloader)


@RunIf(min_cuda_gpus=2)
@RunIf(min_cuda_gpus=2, standalone=True)
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
"""Verify `test()` on pretrained model."""
tutils.set_random_main_port()
Expand Down Expand Up @@ -558,7 +558,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
new_trainer.test(pretrained_model)


@RunIf(min_cuda_gpus=2)
@RunIf(min_cuda_gpus=2, standalone=True)
def test_dp_resume(tmpdir):
"""Make sure DP continues training correctly."""
model = CustomClassificationModelDP(lr=0.1)
Expand Down
6 changes: 4 additions & 2 deletions tests/tests_pytorch/strategies/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@
"trainer_kwargs",
(
pytest.param(dict(accelerator="gpu", devices=1), marks=RunIf(min_cuda_gpus=1)),
pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)),
pytest.param(dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2)),
pytest.param(dict(strategy="dp", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True)),
pytest.param(
dict(strategy="ddp_spawn", accelerator="gpu", devices=2), marks=RunIf(min_cuda_gpus=2, standalone=True)
),
pytest.param(dict(accelerator="mps", devices=1), marks=RunIf(mps=True)),
),
)
Expand Down
3 changes: 2 additions & 1 deletion tests/tests_pytorch/trainer/test_dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir):
assert trainer.state.finished, "DDP Training failed"


@RunIf(min_cuda_gpus=3)
@RunIf(min_cuda_gpus=3, standalone=True)
def test_batch_size_smaller_than_num_gpus(tmpdir):
# we need at least 3 gpus for this test
num_gpus = 3
Expand Down Expand Up @@ -869,6 +869,7 @@ def train_dataloader(self):
limit_train_batches=0.1,
limit_val_batches=0,
accelerator="gpu",
strategy="ddp",
devices=num_gpus,
)

Expand Down