From ebec339f9c6a8369d5515e2ee0c6dcbdcc8338f2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 20:58:39 +0900 Subject: [PATCH 01/50] Remove conda job --- .github/workflows/ci-pytorch-test-conda.yml | 130 -------------------- 1 file changed, 130 deletions(-) delete mode 100644 .github/workflows/ci-pytorch-test-conda.yml diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml deleted file mode 100644 index 64d06a22949d8..0000000000000 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ /dev/null @@ -1,130 +0,0 @@ -name: Test PyTorch with Conda - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -defaults: - run: - shell: bash -l {0} - -jobs: - pl-conda: - runs-on: ubuntu-20.04 - container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} - strategy: - fail-fast: false - matrix: - include: - - {python-version: "3.8", pytorch-version: "1.9"} - - {python-version: "3.8", pytorch-version: "1.10"} - - {python-version: "3.9", pytorch-version: "1.11"} - - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 40 - - steps: - - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/lightning/lightning - - - uses: actions/checkout@v3 - - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v29.0.3 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - 
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - - name: Update base dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - run: | - conda info - conda list - pip install -e .[test] - - - name: Freeze PIL (hotfix) - # import of PILLOW_VERSION which they recently removed in v9.0 in favor of __version__ - run: pip install "Pillow<9.0" # It messes with torchvision - - - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} - working-directory: ./src - run: pytest pytorch_lightning --cov=pytorch_lightning - - - name: Update all dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} - env: - HOROVOD_BUILD_ARCH_FLAGS: "-mfma" - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - run: | - set -e - pip list - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/torch_stable.html - pip install -r requirements/pytorch/strategies.txt - # set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests - pip install pytest-timeout - pip list - # sanity check - python requirements/pytorch/check-avail-extras.py - - - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} - run: bash .actions/pull_legacy_checkpoints.sh - - - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} - working-directory: tests/tests_pytorch - run: 
coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - - - name: Upload pytest results - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-torch${{ matrix.pytorch-version }} - path: tests/tests_pytorch/results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - if: failure() - - - name: Statistics - if: ${{ success() && (steps.skip.outputs.continue == '1') }} - working-directory: tests/tests_pytorch - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - if: ${{ success() && (steps.skip.outputs.continue == '1') }} - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: cpu,pytest,torch${{ matrix.pytorch-version }} - name: CPU-coverage - fail_ci_if_error: false From 2a972194374a6924b7432d55cd195f45c5e66d2d Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 20:59:39 +0900 Subject: [PATCH 02/50] Remove conda job from readme --- .github/workflows/README.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 4ed903c0f3a93..c5a18dad37562 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,15 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- | -| Test PyTorch full | 
.github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | -| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | +| -------------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. 
| CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | +| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. 
| TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 From f8be7ad43d26a8ad2169a6252f2871caab31a061 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 21:02:56 +0900 Subject: [PATCH 03/50] Remove conda jobs from checkgroup --- .github/checkgroup.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index e8892926f6e55..1b5b7b7fe0709 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -28,10 +28,6 @@ subprojects: - ".github/workflows/ci-pytorch*.yml" - ".github/workflows/docs-*.yml" checks: - - "pl-conda (3.8, 1.10)" - - "pl-conda (3.8, 1.9)" - - "pl-conda (3.9, 1.11)" - - "pl-conda (3.9, 1.12)" - "pl-cpu (macOS-11, 3.10, latest, stable)" - "pl-cpu (macOS-11, 3.7, latest, stable)" - "pl-cpu (macOS-11, 3.7, oldest, stable)" @@ -108,10 +104,6 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.9, 11.1.1)" - - "build-conda (3.8, 1.10.1, 11.1.1)" - - "build-conda (3.9, 1.11, 11.3.1)" - - "build-conda (3.9, 1.12, 11.3.1)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" From d62c7cb4808f59504bd2c3107a04117e9057f278 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 21:04:43 +0900 Subject: [PATCH 04/50] Remove conda from docker builds --- .github/workflows/ci-pytorch-dockers.yml | 45 ++---------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 73f303c6cbc44..8c6509576460c 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -127,44 +127,6 @@ jobs: env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - build-conda: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - 
{python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} - file: dockers/base-conda/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ secrets.GITHUB_TOKEN }} - notification_title: ${{ format('Conda; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>' # akihironitta - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - build-ipu: runs-on: ubuntu-20.04 strategy: @@ -248,11 +210,8 @@ jobs: build-NGC: runs-on: ubuntu-20.04 steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Build Conda Docker - # publish master/release - uses: docker/build-push-action@v3 + - uses: actions/checkout@v3 + - uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false From 18c7e64892cca183be42b8d7dc8b1bb7fc4c4345 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 21:05:30 +0900 Subject: [PATCH 05/50] Remove base-conda dockerfile --- dockers/base-conda/Dockerfile | 160 ---------------------------------- 1 file changed, 160 deletions(-) 
delete mode 100644 dockers/base-conda/Dockerfile diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile deleted file mode 100644 index c82c5a4dfa15f..0000000000000 --- a/dockers/base-conda/Dockerfile +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG CUDA_VERSION=11.3.1 - -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 - -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 -ARG CONDA_VERSION=4.11.0 - -SHELL ["/bin/bash", "-c"] -# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ -ENV \ - PATH="$PATH:/root/.local/bin" \ - DEBIAN_FRONTEND=noninteractive \ - TZ=Europe/Prague \ - # CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ - MKL_THREADING_LAYER=GNU - -RUN \ - # TODO: Remove the manual key installation once the base image is updated. 
- # https://github.com/NVIDIA/nvidia-docker/issues/1631 - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update -qq --fix-missing && \ - NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ - CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ - MAX_ALLOWED_NCCL=2.11.4 && \ - TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ - apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - curl \ - unzip \ - ca-certificates \ - libopenmpi-dev \ - libnccl2=$TO_INSTALL_NCCL \ - libnccl-dev=$TO_INSTALL_NCCL && \ -# Install conda and python. -# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ -# Cleaning - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /root/.cache && \ - rm -rf /var/lib/apt/lists/* - -ENV \ - PATH="/root/miniconda3/bin:$PATH" \ - LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" \ - CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ - MKL_THREADING_LAYER=GNU \ - # MAKEFLAGS="-j$(nproc)" \ - MAKEFLAGS="-j2" \ - TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ - CONDA_ENV=lightning - -COPY environment.yml environment.yml - -# conda init -RUN \ - conda update -n base -c defaults conda && \ - CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ - conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \ - -c nvidia -c pytorch -c pytorch-test && \ - conda init bash && \ - # NOTE: this requires that the channel is presented in the yaml before packages \ - printf "import re;\nfname 
= 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ - python prune.py && \ - rm prune.py && \ - cat environment.yml && \ - conda env update --name $CONDA_ENV --file environment.yml && \ - conda install "Pillow<9.0" && \ - conda clean -ya && \ - rm environment.yml - -ENV \ - PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \ - LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" - -COPY ./requirements/pytorch/ ./requirements/pytorch/ -COPY ./.actions/assistant.py assistant.py - -RUN \ - pip list | grep torch && \ - python -c "import torch; print(torch.__version__)" && \ - pip install -q fire && \ - python assistant.py requirements_prune_pkgs torch,torchvision && \ - # Install remaining requirements - pip install --no-cache-dir -r requirements/pytorch/base.txt \ - -r requirements/pytorch/extra.txt \ - -r requirements/pytorch/examples.txt && \ - rm assistant.py - -ENV \ - # if you want this environment to be the default o \ne, uncomment the following line: - CONDA_DEFAULT_ENV=${CONDA_ENV} \ - HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ - HOROVOD_GPU_OPERATIONS=NCCL \ - HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 \ - HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITH_MPI=1 - -RUN \ - HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ - export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ - pip install --no-cache-dir -r requirements/pytorch/strategies.txt - -RUN \ - CUDA_VERSION_MAJOR=$(python -c "import torch ; print(torch.version.cuda.split('.')[0])") && \ - py_ver=$(python -c "print(int('$PYTHON_VERSION'.split('.') >= '3.9'.split('.')))") && \ - # install DALI, needed for examples - # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691 - if [ $py_ver -eq "0" ]; then \ - pip install --extra-index-url 
https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \ - python -c 'from nvidia.dali.pipeline import Pipeline' ; \ - fi - -RUN \ - # install NVIDIA apex - pip install --no-cache-dir --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \ - python -c "from apex import amp" - -RUN \ - # install Bagua - CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ - if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ - python -c "import bagua; print(bagua.__version__)" - -RUN \ - # Show what we have - pip --version && \ - conda info && \ - pip list && \ - python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ - python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python requirements/pytorch/check-avail-extras.py && \ - python requirements/pytorch/check-avail-strategies.py && \ - rm -rf requirements/ From 6c29b6ddc68816c83728843baf924f5fa32e99c4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 21:55:57 +0900 Subject: [PATCH 06/50] Rewrite the strategy matrix while keeping equivalent --- .github/workflows/ci-pytorch-test-full.yml | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index fbdc81b91c0ed..09f37757fde47 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,17 +20,20 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2022, macOS-11] - python-version: ["3.7", "3.10"] # minimum, 
maximum - requires: ["oldest", "latest"] - release: ["stable"] - exclude: - # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. - # TODO: Remove the exclusion when dropping PyTorch 1.9 support. - - {python-version: "3.10", requires: "oldest"} - # TODO: re-enable RC testing - # include: - # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. + # TODO: Add {python-version: "3.10", requires: "oldest"} back when dropping PyTorch 1.9 support. + include: + - {os: "ubuntu-20.04", python-version: "3.7", requires: "oldest", release: "stable"} + - {os: "ubuntu-20.04", python-version: "3.7", requires: "latest", release: "stable"} + - {os: "ubuntu-20.04", python-version: "3.10", requires: "latest", release: "stable"} + - {os: "macos-11", python-version: "3.7", requires: "oldest", release: "stable"} + - {os: "macos-11", python-version: "3.7", requires: "latest", release: "stable"} + - {os: "macos-11", python-version: "3.10", requires: "latest", release: "stable"} + - {os: "windows-2022", python-version: "3.7", requires: "oldest", release: "stable"} + - {os: "windows-2022", python-version: "3.7", requires: "latest", release: "stable"} + - {os: "windows-2022", python-version: "3.10", requires: "latest", release: "stable"} + # Note: enable testing with release candidate as needed + # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} timeout-minutes: 40 From 0f4c8078fc541e82f5ed1167f8d90c5843737e6c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:05:21 +0900 Subject: [PATCH 07/50] Run the workflow on this branch --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 09f37757fde47..40c572d4d6f4e 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ 
b/.github/workflows/ci-pytorch-test-full.yml @@ -3,7 +3,7 @@ name: Test PyTorch full # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] + branches: [master, "release/*", "ci/combine-conda-full"] # FIXME pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] From 2dfa77424766bc51e4aee8059699431cce894fce Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:13:10 +0900 Subject: [PATCH 08/50] Revert "Rewrite the strategy matrix while keeping equivalent" This reverts commit e54298d60e57cffbf8107890987be3fe4a006c77. --- .github/workflows/ci-pytorch-test-full.yml | 25 ++++++++++------------ 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 40c572d4d6f4e..2e2a90f955e49 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,20 +20,17 @@ jobs: strategy: fail-fast: false matrix: - # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. - # TODO: Add {python-version: "3.10", requires: "oldest"} back when dropping PyTorch 1.9 support. 
- include: - - {os: "ubuntu-20.04", python-version: "3.7", requires: "oldest", release: "stable"} - - {os: "ubuntu-20.04", python-version: "3.7", requires: "latest", release: "stable"} - - {os: "ubuntu-20.04", python-version: "3.10", requires: "latest", release: "stable"} - - {os: "macos-11", python-version: "3.7", requires: "oldest", release: "stable"} - - {os: "macos-11", python-version: "3.7", requires: "latest", release: "stable"} - - {os: "macos-11", python-version: "3.10", requires: "latest", release: "stable"} - - {os: "windows-2022", python-version: "3.7", requires: "oldest", release: "stable"} - - {os: "windows-2022", python-version: "3.7", requires: "latest", release: "stable"} - - {os: "windows-2022", python-version: "3.10", requires: "latest", release: "stable"} - # Note: enable testing with release candidate as needed - # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + os: [ubuntu-20.04, windows-2022, macOS-11] + python-version: ["3.7", "3.10"] # minimum, maximum + requires: ["oldest", "latest"] + release: ["stable"] + exclude: + # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. + # TODO: Remove the exclusion when dropping PyTorch 1.9 support. 
+ - {python-version: "3.10", requires: "oldest"} + # TODO: re-enable RC testing + # include: + # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} timeout-minutes: 40 From bbf3eb7de9fcc03e190e482dc1746388c0e94968 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:18:48 +0900 Subject: [PATCH 09/50] Add PyTorch versions --- .github/workflows/ci-pytorch-test-full.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 2e2a90f955e49..2c1bcae470971 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -22,9 +22,12 @@ jobs: matrix: os: [ubuntu-20.04, windows-2022, macOS-11] python-version: ["3.7", "3.10"] # minimum, maximum + pytorch-version: ["1.9", "1.10", "1.11", "1.12"] requires: ["oldest", "latest"] release: ["stable"] exclude: + - {pytorch-version: "1.10", requires: "oldest"} + - {pytorch-version: "1.11", requires: "oldest"} # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. # TODO: Remove the exclusion when dropping PyTorch 1.9 support. 
- {python-version: "3.10", requires: "oldest"} From eb5dc5e6bd07ba801eea34111052e7d31701fddc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:28:01 +0900 Subject: [PATCH 10/50] Run on draft and disable unrelated costly CI --- .azure/gpu-tests.yml | 4 +--- .azure/ipu-tests.yml | 4 +--- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index e53d8f07567ff..e113633e8e51b 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -13,9 +13,7 @@ trigger: - "release/*" - "refs/tags/*" -pr: - - "master" - - "release/*" +pr: none # FIXME variables: - name: continue diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml index a4d68318834a6..c5321da70769a 100644 --- a/.azure/ipu-tests.yml +++ b/.azure/ipu-tests.yml @@ -8,9 +8,7 @@ trigger: - release/* - refs/tags/* -pr: - - master - - release/* +pr: none # FIXME variables: - name: poplar_sdk diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 2c1bcae470971..12503fc0b795f 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -16,7 +16,7 @@ jobs: pl-cpu: runs-on: ${{ matrix.os }} - if: github.event.pull_request.draft == false + # if: github.event.pull_request.draft == false # FIXME strategy: fail-fast: false matrix: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 091c3f606c3ca..68857e6f908f4 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -21,7 +21,7 @@ concurrency: jobs: pl-slow: runs-on: ${{ matrix.os }} - if: github.event.pull_request.draft == false + # if: github.event.pull_request.draft == false # FIXME strategy: fail-fast: false matrix: From 3926002874d50d5a88d3f0700e9144da0a78cccc Mon Sep 17 00:00:00 2001 From: 
Akihiro Nitta Date: Thu, 25 Aug 2022 22:28:14 +0900 Subject: [PATCH 11/50] Revert "Run the workflow on this branch" This reverts commit 51ed8b905d8926b630dce4817124bd486135d3ec. --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 12503fc0b795f..be37efd2d21e6 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -3,7 +3,7 @@ name: Test PyTorch full # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*", "ci/combine-conda-full"] # FIXME + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] From 940468069f7c254d39541f010bc6d7584ddc221a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:36:29 +0900 Subject: [PATCH 12/50] tmp: Lightweight relevant CI --- .github/workflows/ci-pytorch-test-full.yml | 112 ++++++++++----------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index be37efd2d21e6..11dd2445cb89b 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -108,9 +108,9 @@ jobs: restore-keys: | ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} - run: bash .actions/pull_legacy_checkpoints.sh + # - name: Pull legacy checkpoints + # if: ${{ (steps.skip.outputs.continue == '1') }} + # run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies if: ${{ (steps.skip.outputs.continue == '1') }} @@ 
-124,10 +124,10 @@ jobs: pip list shell: bash - - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} - working-directory: ./src - run: pytest pytorch_lightning --cov=pytorch_lightning + # - name: DocTests + # if: ${{ (steps.skip.outputs.continue == '1') }} + # working-directory: ./src + # run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies if: ${{ (steps.skip.outputs.continue == '1') }} @@ -155,57 +155,57 @@ jobs: python -c "import horovod.torch" shell: bash - - name: Cache datasets - if: ${{ (steps.skip.outputs.continue == '1') }} - uses: actions/cache@v3 - with: - path: Datasets - key: pl-dataset + # - name: Cache datasets + # if: ${{ (steps.skip.outputs.continue == '1') }} + # uses: actions/cache@v3 + # with: + # path: Datasets + # key: pl-dataset - name: Sanity check if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} - working-directory: tests/tests_pytorch - # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - - name: Upload pytest results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} - path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - - name: Prepare Examples - if: ${{ (steps.skip.outputs.continue == '1') }} - run: | - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r 
requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - - - name: Run Examples - if: ${{ (steps.skip.outputs.continue == '1') }} - working-directory: ./examples - run: python -m pytest test_pl_examples.py -v --durations=10 - - - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} - working-directory: tests/tests_pytorch - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: cpu,pytest,python${{ matrix.python-version }} - name: CPU-coverage - fail_ci_if_error: false + # - name: Testing PyTorch + # if: ${{ (steps.skip.outputs.continue == '1') }} + # working-directory: tests/tests_pytorch + # # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 + # run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + # - name: Upload pytest results + # if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + # uses: actions/upload-artifact@v3 + # with: + # name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} + # path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + # - name: Prepare Examples + # if: ${{ (steps.skip.outputs.continue == '1') }} + # run: | + # # adjust versions according installed Torch version + # python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt + # pip install -r requirements/pytorch/examples.txt --find-links 
https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade + + # - name: Run Examples + # if: ${{ (steps.skip.outputs.continue == '1') }} + # working-directory: ./examples + # run: python -m pytest test_pl_examples.py -v --durations=10 + + # - name: Statistics + # if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + # working-directory: tests/tests_pytorch + # run: | + # coverage report + # coverage xml + + # - name: Upload coverage to Codecov + # uses: codecov/codecov-action@v3 + # if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} + # # see: https://github.com/actions/toolkit/issues/399 + # continue-on-error: true + # with: + # token: ${{ secrets.CODECOV_TOKEN }} + # file: tests/tests_pytorch/coverage.xml + # flags: cpu,pytest,python${{ matrix.python-version }} + # name: CPU-coverage + # fail_ci_if_error: false From f7bc4ca476bfeae4c2f443b600f4df79765c8e1b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Aug 2022 22:36:56 +0900 Subject: [PATCH 13/50] Fix CI pathfilter --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 11dd2445cb89b..71a7d16ec30e8 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -48,7 +48,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-full.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES From 4ed163d4c787d7bef351104af4ab84e07c3e2968 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Sep 2022 02:32:23 +0900 Subject: [PATCH 14/50] Update matrix --- 
.github/workflows/ci-pytorch-test-full.yml | 32 ++++++++++++---------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 71a7d16ec30e8..0eeffc56746e4 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,20 +20,24 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2022, macOS-11] - python-version: ["3.7", "3.10"] # minimum, maximum - pytorch-version: ["1.9", "1.10", "1.11", "1.12"] - requires: ["oldest", "latest"] - release: ["stable"] - exclude: - - {pytorch-version: "1.10", requires: "oldest"} - - {pytorch-version: "1.11", requires: "oldest"} - # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. - # TODO: Remove the exclusion when dropping PyTorch 1.9 support. - - {python-version: "3.10", requires: "oldest"} - # TODO: re-enable RC testing - # include: - # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + include: + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} + - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} + - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} + - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} + - {os: 
"windows-2022", python-version: "3.8", pytorch-version: "1.10"} + - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} + - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} + # TODO: re-enable RC testing + # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} timeout-minutes: 40 From 56a4474c6358b552d6ad71d8eedbc3bf87bb322b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Sep 2022 03:18:42 +0900 Subject: [PATCH 15/50] Drop skipping logic --- .github/workflows/ci-pytorch-test-conda.yml | 24 +---------- .github/workflows/ci-pytorch-test-full.yml | 44 ++++----------------- 2 files changed, 9 insertions(+), 59 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 64d06a22949d8..c11f939902167 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -39,22 +39,6 @@ jobs: id: changed-files uses: tj-actions/changed-files@v29.0.3 - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Update base dependencies if: ${{ (steps.skip.outputs.continue == '1') }} env: @@ -70,12 +54,10 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -95,11 +77,9 @@ jobs: python 
requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -111,7 +91,7 @@ jobs: if: failure() - name: Statistics - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -119,7 +99,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index fbdc81b91c0ed..aaf114b765093 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -41,63 +41,42 @@ jobs: id: changed-files uses: tj-actions/changed-files@v29.0.3 - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Set up Python ${{ matrix.python-version }} - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; 
days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup - if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q fire # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} + if: runner.os == 'macOS' run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} + if: runner.os == 'windows' run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. dependencies - if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} + if: matrix.requires == 'oldest' run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -106,11 +85,9 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -122,12 +99,10 @@ jobs: shell: bash - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies - if: ${{ 
(steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -136,7 +111,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} + if: runner.os != 'windows' env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -153,43 +128,38 @@ jobs: shell: bash - name: Cache datasets - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check - if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest 
test_pl_examples.py -v --durations=10 - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -197,7 +167,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} + if: always() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From 663bd2c204d04c5dce090d959f91e5cbc7a35799 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Sep 2022 22:11:31 +0900 Subject: [PATCH 16/50] pip list --- .github/workflows/ci-pytorch-test-full.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 73a8a47984952..5388fa83c11f9 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -117,6 +117,8 @@ jobs: pip list shell: bash + - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list + - name: Reinstall Horovod if necessary if: runner.os != 'windows' env: From ab9bc82d6a375ee368ca0e30fb64e430f531940a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Sep 2022 22:17:25 +0900 Subject: [PATCH 17/50] reorder pip list --- .github/workflows/ci-pytorch-test-full.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 5388fa83c11f9..b72caef939ba8 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -117,8 +117,6 @@ jobs: pip list shell: bash - - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list - - name: Reinstall Horovod if necessary if: runner.os != 'windows' env: @@ -136,6 +134,8 @@ jobs: python -c "import horovod.torch" shell: bash + - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list + - name: Cache 
datasets uses: actions/cache@v3 with: From b0b438ea3671d1f836d821fde150f440ce70ec9b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Sep 2022 22:31:24 +0900 Subject: [PATCH 18/50] tmp: lightweight ci --- .github/workflows/ci-pytorch-test-full.yml | 128 ++++++++++----------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index b72caef939ba8..743e44ead47a8 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -77,22 +77,22 @@ jobs: run: | python .actions/assistant.py replace_oldest_ver - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Get pip cache dir - id: pip-cache - run: echo "::set-output name=dir::$(pip cache dir)" - - - name: pip cache - uses: actions/cache@v3 - with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/pytorch/*.txt') }} - restore-keys: | - ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - - - name: Pull legacy checkpoints - run: bash .actions/pull_legacy_checkpoints.sh + # # Note: This uses an internal pip API and may not always work + # # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + # - name: Get pip cache dir + # id: pip-cache + # run: echo "::set-output name=dir::$(pip cache dir)" + + # - name: pip cache + # uses: actions/cache@v3 + # with: + # path: ${{ steps.pip-cache.outputs.dir }} + # key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/pytorch/*.txt') }} + # restore-keys: | + # ${{ runner.os 
}}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- + + # - name: Pull legacy checkpoints + # run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies env: @@ -105,9 +105,9 @@ jobs: pip list shell: bash - - name: DocTests - working-directory: ./src - run: pytest pytorch_lightning --cov=pytorch_lightning + # - name: DocTests + # working-directory: ./src + # run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies run: | @@ -136,52 +136,52 @@ jobs: - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list - - name: Cache datasets - uses: actions/cache@v3 - with: - path: Datasets - key: pl-dataset + # - name: Cache datasets + # uses: actions/cache@v3 + # with: + # path: Datasets + # key: pl-dataset - name: Sanity check run: python requirements/pytorch/check-avail-extras.py - - name: Testing PyTorch - working-directory: tests/tests_pytorch - # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - - name: Upload pytest results - if: failure() - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} - path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - - name: Prepare Examples - run: | - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - - - name: Run Examples - working-directory: ./examples - run: python -m pytest 
test_pl_examples.py -v --durations=10 - - - name: Statistics - if: success() - working-directory: tests/tests_pytorch - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - if: always() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: cpu,pytest,python${{ matrix.python-version }} - name: CPU-coverage - fail_ci_if_error: false + # - name: Testing PyTorch + # working-directory: tests/tests_pytorch + # # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 + # run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + # - name: Upload pytest results + # if: failure() + # uses: actions/upload-artifact@v3 + # with: + # name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} + # path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + # - name: Prepare Examples + # run: | + # # adjust versions according installed Torch version + # python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt + # pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade + + # - name: Run Examples + # working-directory: ./examples + # run: python -m pytest test_pl_examples.py -v --durations=10 + + # - name: Statistics + # if: success() + # working-directory: tests/tests_pytorch + # run: | + # coverage report + # coverage xml + + # - name: Upload coverage to Codecov + # uses: codecov/codecov-action@v3 + # if: always() + # # see: https://github.com/actions/toolkit/issues/399 + # continue-on-error: true + # 
with: + # token: ${{ secrets.CODECOV_TOKEN }} + # file: tests/tests_pytorch/coverage.xml + # flags: cpu,pytest,python${{ matrix.python-version }} + # name: CPU-coverage + # fail_ci_if_error: false From c22b96def99d0a9095caaa252c7acfe280da594f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 02:06:22 +0900 Subject: [PATCH 19/50] Install specified pytorch --- .github/workflows/ci-pytorch-test-full.yml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index d876e6fdd2c07..273a42938aee9 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -99,9 +99,15 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 run: | + # install PyTorch flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) - url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) - pip install -e .[test] --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" + url=https://download.pytorch.org/whl/$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) + pip install torch==${{ matrix.pytorch-version }} $flag -f ${url} + # adjust PyTorch versions in requirements files according to the installed PyTorch version + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt + # install PL and optional dependencies for testing + pip install -e . 
-r requirements/pytorch/test.txt -r ./requirements/pytorch/extra.txt -f ${url} pip list shell: bash @@ -109,14 +115,6 @@ jobs: # working-directory: ./src # run: pytest pytorch_lightning --cov=pytorch_lightning - - name: Install extra dependencies - run: | - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - pip install -r ./requirements/pytorch/extra.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - pip list - shell: bash - - name: Reinstall Horovod if necessary if: runner.os != 'windows' env: From c582ea2b379b785e4c6073de828ea4347807e747 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 10:04:43 +0900 Subject: [PATCH 20/50] Fix torch installation --- .github/workflows/ci-pytorch-test-full.yml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 273a42938aee9..78ad758752475 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -99,15 +99,13 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 run: | - # install PyTorch - flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) - url=https://download.pytorch.org/whl/$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) - pip install torch==${{ matrix.pytorch-version }} $flag -f ${url} - # adjust PyTorch versions in requirements files according to the installed PyTorch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt + # adjust PyTorch versions in requirements files + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${{ matrix.pytorch-version }} + python 
./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${{ matrix.pytorch-version }} # install PL and optional dependencies for testing - pip install -e . -r requirements/pytorch/test.txt -r ./requirements/pytorch/extra.txt -f ${url} + pre_option=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) + url=https://download.pytorch.org/whl/$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) + pip install -e . -r requirements/pytorch/test.txt -r ./requirements/pytorch/extra.txt $pre_option -f ${url} pip list shell: bash From 79c07c31e06809ecc9d7ac2b633d13a35a12ecd6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 10:12:42 +0900 Subject: [PATCH 21/50] Uncomment steps --- .github/workflows/ci-pytorch-test-full.yml | 131 ++++++++++----------- 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 78ad758752475..31a3879c815d7 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -61,7 +61,6 @@ jobs: pip --version pip install -q fire - # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS if: runner.os == 'macOS' run: | @@ -77,22 +76,20 @@ jobs: run: | python .actions/assistant.py replace_oldest_ver - # # Note: This uses an internal pip API and may not always work - # # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - # - name: Get pip cache dir - # id: pip-cache - # run: echo "::set-output name=dir::$(pip cache dir)" + - name: Get pip cache dir + id: pip-cache + run: echo "::set-output name=dir::$(pip cache dir)" - # - name: pip cache - # uses: actions/cache@v3 - # with: - # path: ${{ steps.pip-cache.outputs.dir }} - # key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ 
matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/pytorch/*.txt') }} - # restore-keys: | - # ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- + - name: pip cache + uses: actions/cache@v3 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/pytorch/*.txt') }} + restore-keys: | + ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - # - name: Pull legacy checkpoints - # run: bash .actions/pull_legacy_checkpoints.sh + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies env: @@ -109,9 +106,9 @@ jobs: pip list shell: bash - # - name: DocTests - # working-directory: ./src - # run: pytest pytorch_lightning --cov=pytorch_lightning + - name: DocTests + working-directory: ./src + run: pytest pytorch_lightning --cov=pytorch_lightning - name: Reinstall Horovod if necessary if: runner.os != 'windows' @@ -132,59 +129,53 @@ jobs: - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list - # - name: Cache datasets - # uses: actions/cache@v3 - # with: - # path: Datasets - # key: pl-dataset + - name: Cache datasets + uses: actions/cache@v3 + with: + path: Datasets + key: pl-dataset - name: Sanity check run: python requirements/pytorch/check-avail-extras.py - # - name: Testing Warnings - # # the stacklevel can only be set on >=3.7 - # if: matrix.python-version != '3.7' - # working-directory: tests/tests_pytorch - # # needs to run outside of `pytest` - # run: python utilities/test_warnings.py - - # - name: Testing PyTorch - # working-directory: tests/tests_pytorch - # # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - # run: coverage run --source 
pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - # - name: Upload pytest results - # if: failure() - # uses: actions/upload-artifact@v3 - # with: - # name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} - # path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - # - name: Prepare Examples - # run: | - # # adjust versions according installed Torch version - # python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - # pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - - # - name: Run Examples - # working-directory: ./examples - # run: python -m pytest test_pl_examples.py -v --durations=10 - - # - name: Statistics - # if: success() - # working-directory: tests/tests_pytorch - # run: | - # coverage report - # coverage xml - - # - name: Upload coverage to Codecov - # uses: codecov/codecov-action@v3 - # if: always() - # # see: https://github.com/actions/toolkit/issues/399 - # continue-on-error: true - # with: - # token: ${{ secrets.CODECOV_TOKEN }} - # file: tests/tests_pytorch/coverage.xml - # flags: cpu,pytest,python${{ matrix.python-version }} - # name: CPU-coverage - # fail_ci_if_error: false + - name: Testing Warnings + # the stacklevel can only be set on >=3.7 + if: matrix.python-version != '3.7' + working-directory: tests/tests_pytorch + # needs to run outside of `pytest` + run: python utilities/test_warnings.py + + - name: Testing PyTorch + working-directory: tests/tests_pytorch + # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 + run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ 
matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + - name: Upload pytest results + if: failure() + uses: actions/upload-artifact@v3 + with: + name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} + path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + - name: Run Examples + working-directory: ./examples + run: python -m pytest test_pl_examples.py -v --durations=10 + + - name: Statistics + if: success() + working-directory: tests/tests_pytorch + run: | + coverage report + coverage xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + if: always() + # see: https://github.com/actions/toolkit/issues/399 + continue-on-error: true + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: tests/tests_pytorch/coverage.xml + flags: cpu,pytest,python${{ matrix.python-version }} + name: CPU-coverage + fail_ci_if_error: false From 1099a1b8e41aaa6e7871be73082f79c4d81e8706 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 11:08:44 +0900 Subject: [PATCH 22/50] Increase timeout --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 31a3879c815d7..16021a17e989c 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -39,7 +39,7 @@ jobs: # TODO: re-enable RC testing # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} - timeout-minutes: 40 + timeout-minutes: 50 steps: - uses: actions/checkout@v3 From 978e69b7fa12409f0e1b98484054c808a99d2b6e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 11:09:15 +0900 Subject: [PATCH 23/50] bad merge --- .github/workflows/ci-pytorch-test-full.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 16021a17e989c..5f8ed51bb0757 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -44,10 +44,6 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v29.0.3 - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: From ffc515443f06dacb61b467816f3524f3038d84a6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 8 Sep 2022 13:11:21 +0900 Subject: [PATCH 24/50] Revert "Run on draft and disable unrelated costly CI" This reverts commit eb5dc5e6bd07ba801eea34111052e7d31701fddc. --- .azure/gpu-tests.yml | 4 +++- .azure/ipu-tests.yml | 4 +++- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index e113633e8e51b..e53d8f07567ff 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -13,7 +13,9 @@ trigger: - "release/*" - "refs/tags/*" -pr: none # FIXME +pr: + - "master" + - "release/*" variables: - name: continue diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml index c5321da70769a..a4d68318834a6 100644 --- a/.azure/ipu-tests.yml +++ b/.azure/ipu-tests.yml @@ -8,7 +8,9 @@ trigger: - release/* - refs/tags/* -pr: none # FIXME +pr: + - master + - release/* variables: - name: poplar_sdk diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 5f8ed51bb0757..31ea8565413b6 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -16,7 +16,7 @@ jobs: pl-cpu: runs-on: ${{ matrix.os }} - # if: github.event.pull_request.draft == false # FIXME + if: github.event.pull_request.draft == false strategy: fail-fast: false matrix: diff --git 
a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 68857e6f908f4..091c3f606c3ca 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -21,7 +21,7 @@ concurrency: jobs: pl-slow: runs-on: ${{ matrix.os }} - # if: github.event.pull_request.draft == false # FIXME + if: github.event.pull_request.draft == false strategy: fail-fast: false matrix: From 1076c751b222d1928ce2a4689d130b11306a83b0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Sep 2022 06:16:36 +0900 Subject: [PATCH 25/50] Update checkgroup --- .github/checkgroup.yml | 57 ++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 751265376c327..7433f17a08c59 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -31,15 +31,21 @@ subprojects: - "setup.cfg" # includes pytest config - ".github/workflows/docs-*.yml" checks: - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu (windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04, 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu
(windows-2022, 3.10, 1.12)" - "make-doctest (pytorch)" - "make-html (pytorch)" - "mypy" @@ -52,28 +58,25 @@ subprojects: - "pl-slow (windows-2022, 3.7, 1.11)" - "test-on-tpus" - - id: "pytorch_lightning: Conda" - paths: - - ".github/workflows/ci-pytorch-test-conda.yml" - checks: - - "pl-conda (3.8, 1.10)" - - "pl-conda (3.8, 1.9)" - - "pl-conda (3.9, 1.11)" - - "pl-conda (3.9, 1.12)" - - id: "pytorch_lightning: CPU" paths: - ".github/workflows/ci-pytorch-test-full.yml" checks: - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu (windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04, 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu (windows-2022, 3.10, 1.12)" - id: "pytorch_lightning: Slow" paths: From 8697e012b0274c965600e22f3ddebf5f102e09d2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Sep 2022 06:22:18 +0900 Subject: [PATCH 26/50] Update docs and remove Python/PyTorch versions --- .github/workflows/README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index c5a18dad37562..c171c1ca4e1a9 100644 --- a/.github/workflows/README.md +++
b/.github/workflows/README.md @@ -4,22 +4,25 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| -------------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | -| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. 
| TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | +| -------------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | +| Test PyTorch slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be sped up or simplified. | CPU | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | - \*Accelerators used in CI + - GPU: 2 x NVIDIA Tesla V100 - TPU: Google GKE TPUv3 - IPU: [Colossus MK1 IPU](https://www.graphcore.ai/products/ipu) - HPU: [Intel Habana Gaudi SYS-420GH-TNGR](https://www.supermicro.com/en/products/system/AI/4U/SYS-420GH-TNGR) which has 8 Gaudi accelerators +- To check which versions of Python or PyTorch are used for testing in our CI, see the corresponding workflow files or checkgroup config file at [`.github/checkgroup.yml`](../checkgroup.yml).
+ ## Documentation | workflow file | action | From 622cb19e5183087c39ad0711a5d6c65c1db28aa7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Sep 2022 09:02:44 +0900 Subject: [PATCH 27/50] Remove pip-list --- .github/workflows/ci-pytorch-test-full.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index b4cc08e0ab0e4..11b99129aeee2 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -121,10 +121,9 @@ jobs: fi horovodrun --check-build python -c "import horovod.torch" + pip list shell: bash - - uses: Lightning-AI/utilities/.github/actions/pip-list@ci/pip-list - - name: Cache datasets uses: actions/cache@v3 with: From 41f597bb4a82dda4feb7eeb9e898ae7942ae2cd2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Sep 2022 09:06:21 +0900 Subject: [PATCH 28/50] Fail if wrong pytorch version installed --- .github/workflows/ci-pytorch-test-full.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 11b99129aeee2..877ff24eaafc8 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -131,7 +131,9 @@ jobs: key: pl-dataset - name: Sanity check - run: python requirements/pytorch/check-avail-extras.py + run: | + python -c "import torch; assert torch.__version__.startswith('${{ matrix.pytorch-version }}')" + python requirements/pytorch/check-avail-extras.py - name: Testing Warnings # the stacklevel can only be set on >=3.7 From 93108e4a668325304ef0660e22e7838061a40541 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:17:01 +0900 Subject: [PATCH 29/50] Add Python 3.8, PyTorch 1.9 job --- .github/workflows/ci-pytorch-test-full.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml 
b/.github/workflows/ci-pytorch-test-full.yml index af9edcf9407ef..0981321b2a791 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -23,16 +23,19 @@ jobs: include: - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} From fef8842a27f197a39311bca7d4e7818c0405c3b9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:23:31 +0900 Subject: [PATCH 30/50] tmp: remove azure jobs --- .azure/app-cloud-e2e.yml | 167 ------------------------------------- .azure/gpu-benchmark.yml | 51 ------------ .azure/gpu-tests-lite.yml | 112 ------------------------- 
.azure/gpu-tests.yml | 170 -------------------------------------- .azure/hpu-tests.yml | 107 ------------------------ .azure/ipu-tests.yml | 84 ------------------- 6 files changed, 691 deletions(-) delete mode 100644 .azure/app-cloud-e2e.yml delete mode 100644 .azure/gpu-benchmark.yml delete mode 100644 .azure/gpu-tests-lite.yml delete mode 100644 .azure/gpu-tests.yml delete mode 100644 .azure/hpu-tests.yml delete mode 100644 .azure/ipu-tests.yml diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml deleted file mode 100644 index 1c6822cf2bb54..0000000000000 --- a/.azure/app-cloud-e2e.yml +++ /dev/null @@ -1,167 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - paths: - include: - - ".azure/app-cloud-e2e.yml" - - "requirements/app/**" - - "src/lightning_app/**" - - "examples/app_*" - -pr: - branches: - include: - - "master" - - "release/*" - paths: - include: - - ".azure/app-cloud-e2e.yml" - - "requirements/app/**" - - "src/lightning_app/**" - - "examples/app_*" - -# variables are automatically exported as environment variables so this will override pip's default cache dir -variables: - - name: pip_cache_dir - value: $(Pipeline.Workspace)/.pip - - name: local_id - value: $(Build.BuildId) - -jobs: - - job: App_cloud_e2e_testing - pool: azure-cpus - container: - image: mcr.microsoft.com/playwright/python:v1.25.2-focal - options: "--shm-size=2g" - strategy: - matrix: - 'App: v0_app': - name: "v0_app" - 'App: boring_app': - name: "boring_app" - 'App: template_streamlit_ui': - name: "template_streamlit_ui" - 'App: template_react_ui': - name: "template_react_ui" - 'App: template_jupyterlab': # TODO: clarify where these files 
lives - name: "template_jupyterlab" - 'App: idle_timeout': - name: "idle_timeout" - 'App: collect_failures': - name: "collect_failures" - 'App: custom_work_dependencies': - name: "custom_work_dependencies" - 'App: drive': - name: "drive" - 'App: payload': - name: "payload" - 'App: commands_and_api': - name: "commands_and_api" - timeoutInMinutes: "30" - cancelTimeoutInMinutes: "2" - # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace - workspace: - clean: all - steps: - - - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' - displayName: "Set id for this PR" - condition: eq(variables['Build.Reason'], 'PullRequest') - - - bash: | - whoami - printf "local id: $(local_id)\n" - python --version - pip --version - displayName: 'Info' - - - task: Cache@2 - inputs: - key: 'pip | "$(name)" | requirements/app/base.txt' - restoreKeys: | - pip | "$(Agent.OS)" - path: $(pip_cache_dir) - displayName: Cache pip - - - bash: python -m pip install -r requirements/app/devel.txt --quiet --find-links ${TORCH_URL} - env: - TORCH_URL: https://download.pytorch.org/whl/cpu/torch_stable.html - displayName: 'Install dependencies' - - - bash: | - python -m pip install playwright - python -m playwright install # --with-deps - displayName: 'Install Playwright system dependencies' - - - bash: pip install -e . 
--find-links https://download.pytorch.org/whl/cpu/torch_stable.html - displayName: 'Install lightning' - - - bash: | - rm -rf examples/app_template_jupyterlab || true - git clone https://github.com/Lightning-AI/LAI-lightning-template-jupyterlab-App examples/app_template_jupyterlab - cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py - condition: eq(variables['name'], 'template_jupyterlab') - displayName: 'Clone Template Jupyter Lab Repo' - - - bash: | - rm -rf examples/app_template_react_ui || true - git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui - condition: eq(variables['name'], 'template_react_ui') - displayName: 'Clone Template React UI Repo' - - - bash: | - mkdir -p ${VIDEO_LOCATION} - ls -l examples/${TEST_APP_NAME} - ls -l tests/tests_app_examples - python -m pytest tests/tests_app_examples/test_${TEST_APP_NAME}.py::test_${TEST_APP_NAME}_example_cloud --timeout=900 --capture=no -v --color=yes - env: - HEADLESS: '1' - PACKAGE_LIGHTNING: '1' - CLOUD: '1' - VIDEO_LOCATION: '$(Build.ArtifactStagingDirectory)/videos' - PR_NUMBER: $(local_id) - TEST_APP_NAME: $(name) - HAR_LOCATION: './artifacts/hars' - SLOW_MO: '50' - # LAI_USER: $(LAI_USER) - # LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) - LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) - LIGHTNING_DEBUG: '1' - displayName: 'Run the tests' - - - publish: '$(Build.ArtifactStagingDirectory)/videos' - condition: failed() - displayName: 'Publish videos' - artifact: $(name) - - - bash: | - time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" - condition: always() - env: - # LAI_USER: $(LAI_USER) - # LAI_PASS: $(LAI_PASS) - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) - LIGHTNING_USERNAME: 
$(LIGHTNING_USERNAME) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) - PR_NUMBER: $(local_id) - TEST_APP_NAME: $(name) - # GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning - # GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning - # GRID_URL: $(LIGHTNING_CLOUD_URL) - # _GRID_USERNAME: $(LIGHTNING_USERNAME) - displayName: 'Clean Previous Apps' diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml deleted file mode 100644 index 968186fbd275d..0000000000000 --- a/.azure/gpu-benchmark.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - -pr: none - -schedules: - - cron: "0 0 * * *" # At the end of every day - displayName: Daily midnight benchmark - branches: - include: - - "master" - -jobs: - - job: benchmarks - timeoutInMinutes: "90" - cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot - container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" - workspace: - clean: all - - steps: - - - bash: | - pip install -e . 
-r requirements/pytorch/strategies.txt - pip list - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - displayName: 'Install package' - - - bash: python -m pytest benchmarks -v --durations=0 - env: - PL_RUNNING_BENCHMARKS: 1 - PL_RUN_CUDA_TESTS: "1" - workingDirectory: tests/tests_pytorch - displayName: 'Testing: PyTorch benchmarks' diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml deleted file mode 100644 index 66fc3951b9ce1..0000000000000 --- a/.azure/gpu-tests-lite.yml +++ /dev/null @@ -1,112 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - paths: - include: - - ".azure/gpu-tests-lite.yml" - - "requirements/lite/**" - - "src/lightning_lite/**" - - "tests/tests_lite/**" - - "tests/tests_pytorch/run_standalone_tests.sh" - - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above - -pr: - branches: - include: - - "master" - - "release/*" - paths: - include: - - ".azure/gpu-tests-lite.yml" - - "requirements/lite/**" - - "src/lightning_lite/**" - - "tests/tests_lite/**" - - "tests/tests_pytorch/run_standalone_tests.sh" - - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above - -jobs: - - job: testing - # how long to run the job before automatically cancelling - timeoutInMinutes: "20" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot - container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" - # default shm size is 64m. 
Increase it to avoid: - # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" - workspace: - clean: all - - steps: - - bash: | - lspci | egrep 'VGA|3D' - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - displayName: 'Image info & NVIDIA' - - - bash: | - set -e - TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") - CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} - pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip list - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - displayName: 'Install dependencies' - - - bash: | - set -e - python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" - displayName: 'Env details' - - - bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 - env: - PL_RUN_CUDA_TESTS: "1" - workingDirectory: tests/tests_lite - displayName: 'Testing: Lite standard' - timeoutInMinutes: "10" - - - bash: bash run_standalone_tests.sh - workingDirectory: tests/tests_lite - env: - PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_SOURCE: "lightning_lite" - displayName: 'Testing: Lite standalone tests' - timeoutInMinutes: "10" - - - bash: | - python -m coverage report - python -m coverage xml - python -m coverage html - python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest 
--name="GPU-coverage" --env=linux,azure - ls -l - workingDirectory: tests/tests_lite - displayName: 'Statistics' - - - task: PublishTestResults@2 - displayName: 'Publish test results' - inputs: - testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' - testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: succeededOrFailed() diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml deleted file mode 100644 index 67e4f3d0bac19..0000000000000 --- a/.azure/gpu-tests.yml +++ /dev/null @@ -1,170 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - -pr: - - "master" - - "release/*" - -variables: - - name: continue - value: '1' - -jobs: - - job: testing - strategy: - matrix: - 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" - # how long to run the job before automatically cancelling - timeoutInMinutes: "80" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: azure-jirka-spot - container: - image: $(image) - # default shm size is 64m. Increase it to avoid: - # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" - workspace: - clean: all - - steps: - - - bash: | - CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo $CHANGED_FILES > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "##vso[task.setvariable variable=continue]0" - else - echo "Continue" - echo "##vso[task.setvariable variable=continue]1" - fi - displayName: Skipper - - - bash: | - lspci | egrep 'VGA|3D' - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - displayName: 'Image info & NVIDIA' - condition: eq(variables['continue'], '1') - - - bash: | - set -e - python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" - python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" - TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") - CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} - pip install "bagua-cuda$CUDA_VERSION_BAGUA" - pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip list - env: - PACKAGE_NAME: pytorch - 
FREEZE_REQUIREMENTS: 1 - displayName: 'Install dependencies' - condition: eq(variables['continue'], '1') - - - bash: | - set -e - python requirements/collect_env_details.py - python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-strategies.py - python requirements/pytorch/check-avail-extras.py - displayName: 'Env details' - condition: eq(variables['continue'], '1') - - - bash: bash .actions/pull_legacy_checkpoints.sh - displayName: 'Get legacy checkpoints' - condition: eq(variables['continue'], '1') - - - bash: python -m coverage run --source pytorch_lightning -m pytest - workingDirectory: src/pytorch_lightning - displayName: 'Testing: PyTorch doctests' - condition: eq(variables['continue'], '1') - - - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 - env: - PL_RUN_CUDA_TESTS: "1" - workingDirectory: tests/tests_pytorch - displayName: 'Testing: PyTorch standard' - timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - - - bash: bash run_standalone_tests.sh - workingDirectory: tests/tests_pytorch - env: - PL_USE_MOCKED_MNIST: "1" - PL_RUN_CUDA_TESTS: "1" - PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning" - displayName: 'Testing: PyTorch standalone tests' - timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - - - bash: bash run_standalone_tasks.sh - workingDirectory: tests/tests_pytorch - env: - PL_USE_MOCKED_MNIST: "1" - PL_RUN_CUDA_TESTS: "1" - displayName: 'Testing: PyTorch standalone tasks' - timeoutInMinutes: "10" - condition: eq(variables['continue'], '1') - - - bash: | - python -m coverage report - python -m coverage xml - python -m coverage html - python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure - ls -l - workingDirectory: tests/tests_pytorch - displayName: 
'Statistics' - condition: eq(variables['continue'], '1') - - - task: PublishTestResults@2 - displayName: 'Publish test results' - inputs: - testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' - testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: and(succeededOrFailed(), eq(variables['continue'], '1')) - - - script: | - set -e - bash run_ddp_examples.sh - bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1 - bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp - bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp --trainer.precision=16 - workingDirectory: examples - env: - PL_USE_MOCKED_MNIST: "1" - displayName: 'Testing: PyTorch examples' - condition: eq(variables['continue'], '1') - - - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 - workingDirectory: tests/tests_pytorch - env: - PL_RUN_CUDA_TESTS: "1" - displayName: 'Testing: PyTorch benchmarks' - condition: eq(variables['continue'], '1') diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml deleted file mode 100644 index 3aea24a148314..0000000000000 --- a/.azure/hpu-tests.yml +++ /dev/null @@ -1,107 +0,0 @@ -# Pipeline to run the HPU tests in DL1 Instance - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - paths: - include: - - ".azure/hpu-tests.yml" - - "examples/pl_hpu/mnist_sample.py" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - -pr: - branches: - include: - - "master" - - "release/*" - paths: - include: - - ".azure/hpu-tests.yml" - - "examples/pl_hpu/mnist_sample.py" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - -jobs: - - job: testing - # how long to run the job before automatically cancelling - timeoutInMinutes: "10" - # how much time to give 'run 
always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: intel-hpus - container: - image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" - workspace: - clean: all - - steps: - - script: | - /tmp/docker exec -t -u 0 cd-container \ - sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" - displayName: 'Install Sudo in container (thanks Microsoft!)' - - - bash: | - sudo apt-get install -y hwinfo - hwinfo --short - python --version - sudo pip install pip -U - displayName: 'Instance HW info' - - - bash: | - set -e - pip --version - sudo pip uninstall -y lightning pytorch-lightning - pip install -q -r .actions/requirements.txt - python .actions/assistant.py requirements-prune-pkgs torch,torchvision - pip install ".[extra,test]" - pip list - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - displayName: 'Install dependencies' - - - bash: | - hl-smi -L - lsmod | grep habanalabs - displayName: 'Check the driver status' - - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Single card HPU test' - - - bash: | - python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'Multi card(8) HPU test' - - - bash: | - python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ - 'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \ - 'plugins/precision/hpu/ops_fp32.txt' --forked \ - --junitxml=hpu1_precision_test-results.xml - workingDirectory: tests/tests_pytorch - displayName: 'HPU precision test' - - - bash: | - export 
PYTHONPATH="${PYTHONPATH}:$(pwd)" - python "pl_hpu/mnist_sample.py" - workingDirectory: examples - displayName: 'Testing: HPU examples' - - - task: PublishTestResults@2 - inputs: - testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml' - testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: succeededOrFailed() - displayName: 'Publish test results' diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml deleted file mode 100644 index a4d68318834a6..0000000000000 --- a/.azure/ipu-tests.yml +++ /dev/null @@ -1,84 +0,0 @@ -trigger: - tags: - include: - - '*' - branches: - include: - - master - - release/* - - refs/tags/* - -pr: - - master - - release/* - -variables: - - name: poplar_sdk - value: "poplar_sdk-ubuntu_20_04-2.3.1+793-89796d462d" - -jobs: - - job: testing - # how long to run the job before automatically cancelling - timeoutInMinutes: "15" - pool: graphcore-ipus - workspace: - clean: all - - steps: - - script: tar -xvzf /opt/poplar/${{ variables.poplar_sdk }}.tar.gz - displayName: "Extract Poplar SDK" - - - script: | - set -eux - pip debug --verbose - pip install ${{ variables.poplar_sdk }}/poptorch-*ubuntu*.whl - displayName: "Install poptorch" - - - script: | - set -eux - source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh - NUM_IPUS=$(gc-info --ipu-count) - if [[ -z "${NUM_IPUS}" ]] || [[ "${NUM_IPUS}" -eq 0 ]]; then - echo "No IPUs found to reset. Exiting" - exit 1 - fi - echo "Resetting parity on ${NUM_IPUS} IPU devices" - i=0 - while [[ i -lt "${NUM_IPUS}" ]]; do - gc-reset -d "${i}" - i=$((i + 1)) - done - displayName: "Reset IPU devices" - - - bash: | - export GIT_TERMINAL_PROMPT=1 - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -e . 
--requirement ./requirements/pytorch/devel.txt - pip list - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - displayName: 'Install dependencies' - - - bash: | - python requirements/collect_env_details.py - python -c "import torch" - displayName: 'Env details' - - - script: | - set -eux - source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh - source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh - python -c "import poptorch; print(poptorch.__version__)" - displayName: "Check poptorch installation" - - - bash: | - source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh - source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh - python -m coverage run --source pytorch_lightning -m pytest tests/tests_pytorch -vv --durations=50 - env: - MKL_THREADING_LAYER: "GNU" - POPTORCH_WAIT_FOR_IPU: 1 - PL_RUN_IPU_TESTS: 1 - displayName: 'Testing: PyTorch standard' From 2679278372deecd55d94eb27de20692a0f2c69ea Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:24:20 +0900 Subject: [PATCH 31/50] tmp: remove dockers --- .github/workflows/ci-pytorch-dockers.yml | 218 ----------------------- 1 file changed, 218 deletions(-) delete mode 100644 .github/workflows/ci-pytorch-dockers.yml diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml deleted file mode 100644 index 8c6509576460c..0000000000000 --- a/.github/workflows/ci-pytorch-dockers.yml +++ /dev/null @@ -1,218 +0,0 @@ -name: Docker - -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - paths: - - "dockers/**" - - "!dockers/README.md" - - "requirements.txt" - - "requirements/*.txt" - - "requirements/pytorch/*" - - "environment.yml" - - ".github/workflows/*docker*.yml" - - "setup.py" - schedule: - - cron: "0 0 * * *" # at the end of every day - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -env: - PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} - -jobs: - build-pl: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - # We only release one docker image per PyTorch version. - # The matrix here is the same as the one in release-docker.yml. - - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} - file: dockers/release/Dockerfile - push: false # pushed in release-docker.yml only when PL is released - timeout-minutes: 50 - - build-xla: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - # the config used in '.circleci/config.yml`' - python_version: ["3.7"] - xla_version: ["1.12"] - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - XLA_VERSION=${{ matrix.xla_version }} - file: dockers/base-xla/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} - timeout-minutes: 60 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ 
secrets.GITHUB_TOKEN }} - notification_title: ${{ format('XLA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.xla_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - - build-cuda: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - # These are the base images for PL release docker images, - # so include at least all of the combinations in release-dockers.yml. - - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} - # Used in Lightning-AI/tutorials - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} - file: dockers/base-cuda/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} - timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ secrets.GITHUB_TOKEN }} - notification_title: ${{ format('CUDA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see 
<{run_url}|detail>, cc: <@U01A5T7EY9M>' # akihironitta - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - - build-ipu: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - # the config used in 'dockers/ci-runner-ipu/Dockerfile' - - {python_version: "3.9", pytorch_version: "1.9"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/base-ipu/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 100 - - uses: docker/build-push-action@v3 - with: - build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/ci-runner-ipu/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }} - timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ secrets.GITHUB_TOKEN }} - notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - - build-hpu: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - # the config used in 'dockers/ci-runner-hpu/Dockerfile' - - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 
- - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - build-args: | - DIST=latest - GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} - file: dockers/ci-runner-hpu/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} - timeout-minutes: 10 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ secrets.GITHUB_TOKEN }} - notification_title: ${{ format('HPU; {0} py{1} for *{2}*', runner.os, matrix.gaudi_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U02PV6CL144> <@U0355SJN6HK>' # arao & Mythravarun N R - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - - build-NGC: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - uses: docker/build-push-action@v3 - with: - file: dockers/nvidia/Dockerfile - push: false - timeout-minutes: 55 From d20b51425e46bef56303b16adf61ec30d12ecd9e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:25:53 +0900 Subject: [PATCH 32/50] tmp: remove others --- .github/workflows/ci-pkg-install.yml | 151 --------------------------- .github/workflows/docs-checks.yml | 119 --------------------- 2 files changed, 270 deletions(-) delete mode 100644 .github/workflows/ci-pkg-install.yml delete mode 100644 .github/workflows/docs-checks.yml diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml deleted file mode 100644 index 50e462e2c0f7d..0000000000000 --- a/.github/workflows/ci-pkg-install.yml +++ /dev/null @@ -1,151 +0,0 @@ -name: Package - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: 
[master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -defaults: - run: - shell: bash - -jobs: - - init-temp: - runs-on: ubuntu-20.04 - steps: - - run: mkdir pypi && touch pypi/.placeholder - - uses: actions/upload-artifact@v3 - with: - name: ci-packages-${{ github.sha }} - path: pypi - - install-standalone: - needs: init-temp - runs-on: ${{ matrix.os }} - strategy: - fail-fast: true - max-parallel: 1 - matrix: - os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] - pkg: ["app", "lite", "pytorch"] - python-version: [3.8] # , 3.9 - - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - uses: actions/download-artifact@v3 - with: - name: ci-packages-${{ github.sha }} - path: pypi - - run: ls -lh pypi/ - - - run: python -c "print('NB_DIRS=' + str(2 if '${{ matrix.pkg }}' == 'pytorch' else 1))" >> $GITHUB_ENV - - uses: ./.github/actions/pkg-check - with: - pkg-name: ${{ matrix.pkg }} - nb-dirs: ${{ env.NB_DIRS }} - - - uses: actions/upload-artifact@v3 - with: - name: ci-packages-${{ github.sha }} - path: pypi - - - uses: ./.github/actions/pkg-install - - install-meta-src: - needs: install-standalone - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - # max-parallel: 1 - matrix: - os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] - pkg: ["", "lightning"] - python-version: [3.8] # , 3.9 - - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - run: mkdir -p pypi - - uses: actions/download-artifact@v3 - if: ${{ matrix.pkg != '' }} - with: - name: ci-packages-${{ github.sha }} - path: pypi - - run: ls -lh pypi/ - - - uses: ./.github/actions/pkg-check - with: 
- pkg-name: ${{ matrix.pkg }} - - - uses: ./.github/actions/pkg-install - with: - pkg-name: "lightning" - pip-flags: "-U --pre --find-links ../pypi/" - - - name: Run CLI - run: python -m lightning --version - - install-meta-pypi: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - # max-parallel: 1 - matrix: - os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] - python-version: [3.8] # , 3.9 - - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Dowload package - # todo: download also lite after it is fist published - run: | - pip install -q -r .actions/requirements.txt - for pkg in 'app' 'pytorch' ; do - python .actions/assistant.py download-package "$pkg" --folder pypi - done - ls -lh pypi/ - - - name: Unzip packages - working-directory: pypi - run: for file in `ls *.gz`; do tar -xzf $file; done - - name: Show upacked pkgs - if: runner.os == 'linux' - run: | - sudo apt install -y tree - tree pypi/ -L 3 - - - name: Miror source - run: | - pip install -q -r .actions/requirements.txt - python .actions/assistant.py mirror-pkg2source pypi src - ls -R src/ - - - uses: ./.github/actions/pkg-check - with: - pkg-name: "lightning" - - - uses: ./.github/actions/pkg-install - with: - pkg-name: "lightning" - pip-flags: "-U --pre --find-links ../pypi/" - - - name: Run CLI - run: python -m lightning --version diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml deleted file mode 100644 index c63fe01709fad..0000000000000 --- a/.github/workflows/docs-checks.yml +++ /dev/null @@ -1,119 +0,0 @@ -name: Check Docs -# https://github.com/marketplace/actions/sphinx-build - -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -jobs: - make-doctest: - runs-on: ubuntu-20.04 - needs: make-html # make it depending on build docs to reduce load - strategy: - fail-fast: false - matrix: - pkg: ["app", "pytorch"] # TODO: , "lit" - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Reset caching - run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Cache pip - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-docs-test-pip-td${{ env.TIME_PERIOD }}-${{ hashFiles('requirements/${{ matrix.pkg }}/*.txt') }} - restore-keys: | - ${{ runner.os }}-docs-test-pip-td${{ env.TIME_PERIOD }}- - - - name: Install dependencies - env: - FREEZE_REQUIREMENTS: 1 - PACKAGE_NAME: ${{ matrix.pkg }} - run: | - sudo apt-get update - sudo apt-get install -y cmake pandoc - pip --version - # python -m pip install --upgrade --user pip - pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - pip install -r requirements/${{ matrix.pkg }}/devel.txt - pip list - shell: bash - - - name: Test Documentation - env: - SPHINX_MOCK_REQUIREMENTS: 0 - working-directory: ./docs/source-${{ matrix.pkg }} - run: | - # ToDo: proper parametrize - # First run the same pipeline as Read-The-Docs - make doctest - make coverage - - make-html: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - pkg: ["app", "pytorch", "lit"] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - # lfs: true - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Cache pip - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-docs-make-pip-${{ hashFiles('requirements/${{ matrix.pkg }}/*.txt') }} - restore-keys: | - ${{ runner.os }}-docs-make-pip- - - - name: Install dependencies - env: - FREEZE_REQUIREMENTS: 1 - PACKAGE_NAME: ${{ matrix.pkg }} - run: | - sudo apt-get update - sudo apt-get install -y cmake pandoc - pip --version - pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux - sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures - pip list - shell: bash - - - name: Make Documentation - working-directory: ./docs/source-${{ matrix.pkg }} - run: | - # ToDo: rather use python cmd - # First run the same pipeline as Read-The-Docs - make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" - - - name: Upload built docs - uses: actions/upload-artifact@v3 - with: - name: docs-${{ matrix.pkg }}-${{ github.sha }} - path: docs/build/html/ - # Use always() to always run this step to publish test results when there are test failures - if: success() From 66c285dc99c0d4895103313d9d543ff880ca1ec4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:27:46 +0900 Subject: [PATCH 33/50] Run all combinations --- .github/workflows/ci-pytorch-test-full.yml | 45 ++++++++++++---------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 0981321b2a791..1753351e8a09d 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,27 +20,30 @@ jobs: strategy: fail-fast: false matrix: - include: - - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} - - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} - - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} - - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} - - {os: "macos-11", python-version: "3.7", 
pytorch-version: "1.9", requires: "oldest"} - - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} - - {os: "macos-11", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} - - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} - - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} - - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} - - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} - - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} - - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} - # TODO: re-enable RC testing - # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} + os: ["ubuntu-20.04", "macos-11", "windows-2022"] + python-version: ["3.7", "3.8", "3.9", "3.10"] + pytorch-version: ["1.9", "1.10", "1.11", "1.12"] + # include: + # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} + # - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job + # - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} + # - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} + # - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} + # - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + # - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} + # - {os: "macos-11", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job + # - {os: "macos-11", python-version: "3.8", 
pytorch-version: "1.10"} + # - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} + # - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} + # - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + # - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} + # - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job + # - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} + # - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} + # - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} + # # TODO: re-enable RC testing + # # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} timeout-minutes: 50 From 371a59489d37d1e6dfe6d8e3e5fc2e58ecefe4f1 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:28:41 +0900 Subject: [PATCH 34/50] Include oldest --- .github/workflows/ci-pytorch-test-full.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 1753351e8a09d..21529e48c2b80 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -23,6 +23,10 @@ jobs: os: ["ubuntu-20.04", "macos-11", "windows-2022"] python-version: ["3.7", "3.8", "3.9", "3.10"] pytorch-version: ["1.9", "1.10", "1.11", "1.12"] + include: + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} # include: # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} From e2df9645d85690e509c3a05ad03f173ee3b51101 Mon Sep 17 00:00:00 2001 From: 
Akihiro Nitta Date: Sun, 18 Sep 2022 14:35:52 +0900 Subject: [PATCH 35/50] Exclude no Python 3.10 distributions --- .github/workflows/ci-pytorch-test-full.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 21529e48c2b80..83ffe36a8546c 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -27,6 +27,9 @@ jobs: - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + exclude: + - {python-version: "3.10", pytorch-version: "1.9"} # No distribution with Python 3.10 + - {python-version: "3.10", pytorch-version: "1.10"} # No distribution with Python 3.10 # include: # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} From 4a7978dcb3499ce754306580412110b7a42920cd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 14:46:50 +0900 Subject: [PATCH 36/50] tmp: no concurrency --- .github/workflows/ci-pytorch-test-full.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 83ffe36a8546c..a2118ec98be4f 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -8,10 +8,6 @@ on: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - jobs: pl-cpu: From 3581c9778c42ce45d463310d5df9e485feac907b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 16:58:18 +0900 Subject: [PATCH 37/50] tmp: double timeout --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index a2118ec98be4f..2cb2a2837f51a 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -48,7 +48,7 @@ jobs: # # TODO: re-enable RC testing # # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} - timeout-minutes: 50 + timeout-minutes: 100 steps: - uses: actions/checkout@v3 From c6e15a4b2611b2c0a1e700813e559b06cf5824a0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 17:17:53 +0900 Subject: [PATCH 38/50] Add pytest log reporter --- .github/workflows/ci-pytorch-test-full.yml | 36 +++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 2cb2a2837f51a..b9e8d25679ccb 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -154,7 +154,14 @@ jobs: - name: Testing PyTorch working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml --report-log=log-${{ runner.os }}-py${{ matrix.python-version }}-${{ 
matrix.requires }}-${{ matrix.release }}.json + + - name: Upload pytest log file + if: always() + uses: actions/upload-artifact@v3 + with: + name: pytest-logs-pl + path: tests/tests_pytorch/log-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.json - name: Upload pytest results if: failure() @@ -185,3 +192,30 @@ jobs: flags: cpu,pytest,python${{ matrix.python-version }} name: CPU-coverage fail_ci_if_error: false + + report-results: + runs-on: ubuntu-latest + needs: pl-cpu + steps: + - name: Download artifacts + uses: actions/download-artifact@v3 + with: + name: pytest-log-pl + path: pytest_logs + + - name: Check files + run: | + ls -al + ls -al pytest_logs/ + + - name: Show tests that are skipped by any configurations + working-directory: pytest_logs + run: | + # TODO: Simplify commands + jq 'select(."$report_type" == "TestReport") | select(.outcome == "skipped") | [.nodeid, .outcome] | join(" ")' *.json | sort | uniq -c | sort -r | sed 's/^[ ]*//' + + - name: Show tests that are skipped by all configurations + working-directory: pytest_logs + run: | + # TODO: Simplify commands + jq 'select(."$report_type" == "TestReport") | select(.outcome == "skipped") | [.nodeid, .outcome] | join(" ")' *.json | sort | uniq -c | sort -r | sed 's/^[ ]*//' | grep ^$(ls *.json | wc -l | sed "s/^[ ]*//") From 5332ef5e22de71635e6394837f2817321f45c53a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 17:24:09 +0900 Subject: [PATCH 39/50] Add pytest-reportlog --- requirements/pytorch/test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 22bd62bef9311..66027c79c8808 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -4,6 +4,7 @@ pytest>=7.0, <=7.1.2 pytest-cov <=3.0.0 pytest-forked <=1.4.0 pytest-rerunfailures>=10.2 +pytest-reportlog<=0.1.2 pre-commit>=1.0 mypy==0.971 From 598f315d1b218a1133ae272fc77b071ee9f1f834 Mon Sep 17 
00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 17:30:21 +0900 Subject: [PATCH 40/50] Fewer jobs --- .github/workflows/ci-pytorch-test-full.yml | 51 ++++++++++------------ 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index b9e8d25679ccb..a33c990d195d8 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -16,37 +16,34 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-20.04", "macos-11", "windows-2022"] - python-version: ["3.7", "3.8", "3.9", "3.10"] - pytorch-version: ["1.9", "1.10", "1.11", "1.12"] - include: - - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - exclude: - - {python-version: "3.10", pytorch-version: "1.9"} # No distribution with Python 3.10 - - {python-version: "3.10", pytorch-version: "1.10"} # No distribution with Python 3.10 + # os: ["ubuntu-20.04", "macos-11", "windows-2022"] + # python-version: ["3.7", "3.8", "3.9", "3.10"] + # pytorch-version: ["1.9", "1.10", "1.11", "1.12"] # include: # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} - # - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - # - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} - # - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} - # - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} # - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} - # - {os: 
"macos-11", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - # - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} - # - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} - # - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} # - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} - # - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.9"} # non-oldest to simulate conda job - # - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} - # - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} - # - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} - # # TODO: re-enable RC testing - # # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} + # exclude: + # - {python-version: "3.10", pytorch-version: "1.9"} # No distribution with Python 3.10 + # - {python-version: "3.10", pytorch-version: "1.10"} # No distribution with Python 3.10 + include: + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} + - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} + - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} + - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "windows-2022", python-version: 
"3.7", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} + - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} + - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} + # TODO: re-enable RC testing + # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} timeout-minutes: 100 From eb9d490e26b321b50a990e11331e842ba764d902 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 17:30:40 +0900 Subject: [PATCH 41/50] Revert "tmp: no concurrency" This reverts commit 4a7978dcb3499ce754306580412110b7a42920cd. --- .github/workflows/ci-pytorch-test-full.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index a33c990d195d8..86b7640075c05 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -8,6 +8,10 @@ on: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + jobs: pl-cpu: From 9f6ff157fdf3410b56068308ceae6d9469443ca9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 18 Sep 2022 20:57:44 +0900 Subject: [PATCH 42/50] fix artifact name --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 86b7640075c05..49aab8347bf58 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -201,7 +201,7 @@ jobs: - name: Download artifacts uses: actions/download-artifact@v3 with: - name: pytest-log-pl + name: pytest-logs-pl path: pytest_logs - name: Check files From 3556f649b3ae27c8abd5bd932efd24deeddcfc2f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 20:07:40 +0900 Subject: [PATCH 43/50] Revert test reports --- .github/workflows/ci-pytorch-test-full.yml | 46 +--------------------- requirements/pytorch/test.txt | 1 - 2 files changed, 1 insertion(+), 46 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 6ed7f058f62bb..783a25aacf63f 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -29,16 +29,6 @@ jobs: strategy: fail-fast: false matrix: - # os: ["ubuntu-20.04", "macos-11", "windows-2022"] - # python-version: ["3.7", "3.8", "3.9", "3.10"] - # pytorch-version: ["1.9", "1.10", "1.11", "1.12"] - # include: - # - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # exclude: - # - {python-version: "3.10", pytorch-version: "1.9"} # No distribution with Python 3.10 - # - {python-version: 
"3.10", pytorch-version: "1.10"} # No distribution with Python 3.10 include: - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} @@ -164,7 +154,7 @@ jobs: - name: Testing PyTorch working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml --report-log=log-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.json + run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest log file if: always() @@ -173,13 +163,6 @@ jobs: name: pytest-logs-pl path: tests/tests_pytorch/log-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.json - - name: Upload pytest results - if: failure() - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} - path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - name: Run Examples working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 @@ -202,30 +185,3 @@ jobs: flags: cpu,pytest,python${{ matrix.python-version }} name: CPU-coverage fail_ci_if_error: false - - report-results: - runs-on: ubuntu-latest - needs: pl-cpu - steps: - - name: Download artifacts - uses: actions/download-artifact@v3 - with: - name: pytest-logs-pl - path: pytest_logs - - - name: Check files - run: | - ls -al - ls -al pytest_logs/ - - - name: Show tests that 
are skipped by any configurations - working-directory: pytest_logs - run: | - # TODO: Simplify commands - jq 'select(."$report_type" == "TestReport") | select(.outcome == "skipped") | [.nodeid, .outcome] | join(" ")' *.json | sort | uniq -c | sort -r | sed 's/^[ ]*//' - - - name: Show tests that are skipped by all configurations - working-directory: pytest_logs - run: | - # TODO: Simplify commands - jq 'select(."$report_type" == "TestReport") | select(.outcome == "skipped") | [.nodeid, .outcome] | join(" ")' *.json | sort | uniq -c | sort -r | sed 's/^[ ]*//' | grep ^$(ls *.json | wc -l | sed "s/^[ ]*//") diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index b02fc2514aa1f..a6fa5908bb36d 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -4,7 +4,6 @@ pytest>=7.0, <=7.1.2 pytest-cov <=3.0.0 pytest-forked <=1.4.0 pytest-rerunfailures>=10.2 -pytest-reportlog<=0.1.2 pre-commit>=1.0 mypy==0.971 From f3f8c8cca2ebdafafa3093b3312f1c42e7d6a874 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 20:08:40 +0900 Subject: [PATCH 44/50] Revert unrelated changes --- .azure/app-cloud-e2e.yml | 184 +++++++++++++++++++++++++++ .azure/gpu-benchmark.yml | 51 ++++++++ .azure/gpu-tests-lite.yml | 112 ++++++++++++++++ .azure/gpu-tests.yml | 179 ++++++++++++++++++++++++++ .azure/hpu-tests.yml | 107 ++++++++++++++++ .azure/ipu-tests.yml | 98 ++++++++++++++ .github/workflows/ci-pkg-install.yml | 151 ++++++++++++++++++++++ .github/workflows/docs-checks.yml | 115 +++++++++++++++++ 8 files changed, 997 insertions(+) create mode 100644 .azure/app-cloud-e2e.yml create mode 100644 .azure/gpu-benchmark.yml create mode 100644 .azure/gpu-tests-lite.yml create mode 100644 .azure/gpu-tests.yml create mode 100644 .azure/hpu-tests.yml create mode 100644 .azure/ipu-tests.yml create mode 100644 .github/workflows/ci-pkg-install.yml create mode 100644 .github/workflows/docs-checks.yml diff --git a/.azure/app-cloud-e2e.yml 
b/.azure/app-cloud-e2e.yml new file mode 100644 index 0000000000000..fc72798f41f7b --- /dev/null +++ b/.azure/app-cloud-e2e.yml @@ -0,0 +1,184 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + paths: + include: + - ".azure/app-cloud-e2e.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" + +pr: + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/app-cloud-e2e.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" + +# variables are automatically exported as environment variables so this will override pip's default cache dir +variables: + - name: pip_cache_dir + value: $(Pipeline.Workspace)/.pip + - name: local_id + value: $(Build.BuildId) + +jobs: + - job: App_cloud_e2e_testing + pool: azure-cpus + container: + image: mcr.microsoft.com/playwright/python:v1.25.2-focal + options: "--shm-size=4gb" + strategy: + matrix: + 'App: v0_app': + name: "v0_app" + 'App: boring_app': + name: "boring_app" + 'App: template_streamlit_ui': + name: "template_streamlit_ui" + 'App: template_react_ui': + name: "template_react_ui" + 'App: template_jupyterlab': # TODO: clarify where these files lives + name: "template_jupyterlab" + 'App: idle_timeout': + name: "idle_timeout" + 'App: collect_failures': + name: "collect_failures" + 'App: custom_work_dependencies': + name: "custom_work_dependencies" + 'App: drive': + name: "drive" + 'App: payload': + name: "payload" + 'App: commands_and_api': + name: "commands_and_api" + 'App: quick_start': + name: "quick_start" + timeoutInMinutes: "30" + cancelTimeoutInMinutes: "2" + # values: 
https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace + workspace: + clean: all + steps: + + - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' + displayName: "Set id for this PR" + condition: eq(variables['Build.Reason'], 'PullRequest') + + - bash: | + whoami + printf "local id: $(local_id)\n" + python --version + pip --version + displayName: 'Info' + + - task: Cache@2 + condition: ne(variables['name'], 'quick_start') + inputs: + key: 'pip | "$(name)" | requirements/app/base.txt' + restoreKeys: | + pip | "$(Agent.OS)" + path: $(pip_cache_dir) + displayName: Cache pip + + - bash: git restore . && python -m pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + displayName: 'Install lightning app' + env: + PACKAGE_NAME: app + + - bash: git restore . && python -m pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + displayName: 'Install pytorch lightning' + env: + PACKAGE_NAME: pytorch + + - bash: python -m pip install -r requirements/app/test.txt -r requirements/app/ui.txt + displayName: 'Install dependencies' + + - bash: | + python -m pip install playwright + python -m playwright install # --with-deps + displayName: 'Install Playwright system dependencies' + + - bash: | + rm -rf examples/app_template_jupyterlab || true + git clone https://github.com/Lightning-AI/LAI-lightning-template-jupyterlab-App examples/app_template_jupyterlab + cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py + condition: eq(variables['name'], 'template_jupyterlab') + displayName: 'Clone Template Jupyter Lab Repo' + + - bash: | + rm -rf examples/app_template_react_ui || true + git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui + condition: eq(variables['name'], 'template_react_ui') + displayName: 'Clone 
Template React UI Repo' + + - bash: python -m lightning install app lightning/quick-start -y + condition: eq(variables['name'], 'quick_start') + displayName: 'Install Quick Start' + + - bash: | + pip --version + pip list + displayName: 'List pip dependency' + + - bash: | + mkdir -p ${VIDEO_LOCATION} + ls -l examples/${TEST_APP_NAME} + ls -l tests/tests_app_examples + python -m pytest tests/tests_app_examples/test_${TEST_APP_NAME}.py::test_${TEST_APP_NAME}_example_cloud --timeout=1200 --capture=no -v --color=yes + env: + HEADLESS: '1' + PACKAGE_LIGHTNING: '1' + CLOUD: '1' + VIDEO_LOCATION: '$(Build.ArtifactStagingDirectory)/videos' + PR_NUMBER: $(local_id) + TEST_APP_NAME: $(name) + HAR_LOCATION: './artifacts/hars' + SLOW_MO: '50' + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) + LIGHTNING_DEBUG: '1' + displayName: 'Run the tests' + + - publish: '$(Build.ArtifactStagingDirectory)/videos' + condition: failed() + displayName: 'Publish videos' + artifact: $(name) + + - bash: | + time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" + condition: always() + env: + # LAI_USER: $(LAI_USER) + # LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) + PR_NUMBER: $(local_id) + TEST_APP_NAME: $(name) + # GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning + # GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning + # GRID_URL: $(LIGHTNING_CLOUD_URL) + # _GRID_USERNAME: $(LIGHTNING_USERNAME) + displayName: 'Clean Previous Apps' diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml new file mode 100644 index 0000000000000..968186fbd275d --- /dev/null +++ 
b/.azure/gpu-benchmark.yml @@ -0,0 +1,51 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + +pr: none + +schedules: + - cron: "0 0 * * *" # At the end of every day + displayName: Daily midnight benchmark + branches: + include: + - "master" + +jobs: + - job: benchmarks + timeoutInMinutes: "90" + cancelTimeoutInMinutes: "2" + pool: azure-jirka-spot + container: + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" + workspace: + clean: all + + steps: + + - bash: | + pip install -e . -r requirements/pytorch/strategies.txt + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install package' + + - bash: python -m pytest benchmarks -v --durations=0 + env: + PL_RUNNING_BENCHMARKS: 1 + PL_RUN_CUDA_TESTS: "1" + workingDirectory: tests/tests_pytorch + displayName: 'Testing: PyTorch benchmarks' diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml new file mode 100644 index 0000000000000..66fc3951b9ce1 --- /dev/null +++ b/.azure/gpu-tests-lite.yml @@ -0,0 +1,112 @@ +# Python package +# Create and test a Python package on multiple Python versions. 
+# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + paths: + include: + - ".azure/gpu-tests-lite.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "tests/tests_pytorch/run_standalone_tests.sh" + - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above + +pr: + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/gpu-tests-lite.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "tests/tests_pytorch/run_standalone_tests.sh" + - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above + +jobs: + - job: testing + # how long to run the job before automatically cancelling + timeoutInMinutes: "20" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + pool: azure-jirka-spot + container: + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + # default shm size is 64m. 
Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + workspace: + clean: all + + steps: + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + displayName: 'Image info & NVIDIA' + + - bash: | + set -e + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + + - bash: | + set -e + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + displayName: 'Env details' + + - bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + env: + PL_RUN_CUDA_TESTS: "1" + workingDirectory: tests/tests_lite + displayName: 'Testing: Lite standard' + timeoutInMinutes: "10" + + - bash: bash run_standalone_tests.sh + workingDirectory: tests/tests_lite + env: + PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_SOURCE: "lightning_lite" + displayName: 'Testing: Lite standalone tests' + timeoutInMinutes: "10" + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest 
--name="GPU-coverage" --env=linux,azure + ls -l + workingDirectory: tests/tests_lite + displayName: 'Statistics' + + - task: PublishTestResults@2 + displayName: 'Publish test results' + inputs: + testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml new file mode 100644 index 0000000000000..1e589e708cb39 --- /dev/null +++ b/.azure/gpu-tests.yml @@ -0,0 +1,179 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + paths: + include: + - ".azure/gpu-tests.yml" + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + +pr: + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/gpu-tests.yml" + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - 
"examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + +jobs: + - job: testing + strategy: + matrix: + 'PyTorch - stable': + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + # how long to run the job before automatically cancelling + timeoutInMinutes: "80" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + pool: azure-jirka-spot + container: + image: $(image) + # default shm size is 64m. Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + workspace: + clean: all + + steps: + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + displayName: 'Image info & NVIDIA' + + - bash: | + set -e + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} + python 
./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} + pip install "bagua-cuda$CUDA_VERSION_BAGUA" + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + + - bash: | + set -e + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-strategies.py + python requirements/pytorch/check-avail-extras.py + displayName: 'Env details' + + - bash: bash .actions/pull_legacy_checkpoints.sh + displayName: 'Get legacy checkpoints' + + - bash: python -m coverage run --source pytorch_lightning -m pytest + workingDirectory: src/pytorch_lightning + displayName: 'Testing: PyTorch doctests' + + - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + env: + PL_RUN_CUDA_TESTS: "1" + workingDirectory: tests/tests_pytorch + displayName: 'Testing: PyTorch standard' + timeoutInMinutes: "35" + + - bash: bash run_standalone_tests.sh + workingDirectory: tests/tests_pytorch + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning" + displayName: 'Testing: PyTorch standalone tests' + timeoutInMinutes: "35" + + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_pytorch + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + displayName: 'Testing: PyTorch standalone tasks' + timeoutInMinutes: "10" + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) 
--commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + ls -l + workingDirectory: tests/tests_pytorch + displayName: 'Statistics' + + - task: PublishTestResults@2 + displayName: 'Publish test results' + inputs: + testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() + + - script: | + set -e + bash run_ddp_examples.sh + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1 + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp --trainer.precision=16 + workingDirectory: examples + env: + PL_USE_MOCKED_MNIST: "1" + displayName: 'Testing: PyTorch examples' + + - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 + workingDirectory: tests/tests_pytorch + env: + PL_RUN_CUDA_TESTS: "1" + displayName: 'Testing: PyTorch benchmarks' diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml new file mode 100644 index 0000000000000..324c364d61f20 --- /dev/null +++ b/.azure/hpu-tests.yml @@ -0,0 +1,107 @@ +# Pipeline to run the HPU tests in DL1 Instance + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + paths: + include: + - ".azure/hpu-tests.yml" + - "examples/pl_hpu/mnist_sample.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + +pr: + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/hpu-tests.yml" + - "examples/pl_hpu/mnist_sample.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + +jobs: + - job: testing + # how long to run the job before automatically cancelling + timeoutInMinutes: "10" + # how much time to give 'run always even if 
cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + pool: intel-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.6.0/ubuntu20.04/habanalabs/pytorch-installer-1.12.0:latest" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + workspace: + clean: all + + steps: + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + + - bash: | + sudo apt-get install -y hwinfo + hwinfo --short + python --version + sudo pip install pip -U + displayName: 'Instance HW info' + + - bash: | + set -e + pip --version + sudo pip uninstall -y lightning pytorch-lightning + pip install -q -r .actions/requirements.txt + python .actions/assistant.py requirements-prune-pkgs torch,torchvision + pip install ".[extra,test]" + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + + - bash: | + hl-smi -L + lsmod | grep habanalabs + displayName: 'Check the driver status' + + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Single card HPU test' + + - bash: | + python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'Multi card(8) HPU test' + + - bash: | + python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ + 'plugins/precision/hpu/ops_bf16.txt' --hmp-fp32 \ + 'plugins/precision/hpu/ops_fp32.txt' --forked \ + --junitxml=hpu1_precision_test-results.xml + workingDirectory: tests/tests_pytorch + displayName: 'HPU precision test' + + - bash: | + export 
PYTHONPATH="${PYTHONPATH}:$(pwd)" + python "pl_hpu/mnist_sample.py" + workingDirectory: examples + displayName: 'Testing: HPU examples' + + - task: PublishTestResults@2 + inputs: + testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() + displayName: 'Publish test results' diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml new file mode 100644 index 0000000000000..e6b6fcb246a08 --- /dev/null +++ b/.azure/ipu-tests.yml @@ -0,0 +1,98 @@ +trigger: + tags: + include: + - '*' + branches: + include: + - master + - release/* + - refs/tags/* + paths: + include: + - ".azure/ipu-tests.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + +pr: + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/ipu-tests.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + +variables: + - name: poplar_sdk + value: "poplar_sdk-ubuntu_20_04-2.3.1+793-89796d462d" + +jobs: + - job: testing + # how long to run the job before automatically cancelling + timeoutInMinutes: "15" + pool: graphcore-ipus + workspace: + clean: all + + steps: + - script: tar -xvzf /opt/poplar/${{ variables.poplar_sdk }}.tar.gz + displayName: "Extract Poplar SDK" + + - script: | + set -eux + pip debug --verbose + pip install ${{ variables.poplar_sdk }}/poptorch-*ubuntu*.whl + displayName: "Install poptorch" + + - script: | + set -eux + source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh + NUM_IPUS=$(gc-info --ipu-count) + if [[ -z "${NUM_IPUS}" ]] || [[ "${NUM_IPUS}" -eq 0 ]]; then + echo "No IPUs found to reset. 
Exiting" + exit 1 + fi + echo "Resetting parity on ${NUM_IPUS} IPU devices" + i=0 + while [[ i -lt "${NUM_IPUS}" ]]; do + gc-reset -d "${i}" + i=$((i + 1)) + done + displayName: "Reset IPU devices" + + - bash: | + export GIT_TERMINAL_PROMPT=1 + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt + pip install -e . --requirement ./requirements/pytorch/devel.txt + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + + - bash: | + python requirements/collect_env_details.py + python -c "import torch" + displayName: 'Env details' + + - script: | + set -eux + source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh + source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh + python -c "import poptorch; print(poptorch.__version__)" + displayName: "Check poptorch installation" + + - bash: | + source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh + source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh + python -m coverage run --source pytorch_lightning -m pytest tests/tests_pytorch -vv --durations=50 + env: + MKL_THREADING_LAYER: "GNU" + POPTORCH_WAIT_FOR_IPU: 1 + PL_RUN_IPU_TESTS: 1 + displayName: 'Testing: PyTorch standard' diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml new file mode 100644 index 0000000000000..50e462e2c0f7d --- /dev/null +++ b/.github/workflows/ci-pkg-install.yml @@ -0,0 +1,151 @@ +name: Package + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows +on: + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +defaults: + run: + shell: bash + +jobs: + + init-temp: + runs-on: ubuntu-20.04 + steps: + - run: mkdir pypi && touch pypi/.placeholder + - uses: actions/upload-artifact@v3 + with: + name: ci-packages-${{ github.sha }} + path: pypi + + install-standalone: + needs: init-temp + runs-on: ${{ matrix.os }} + strategy: + fail-fast: true + max-parallel: 1 + matrix: + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] + pkg: ["app", "lite", "pytorch"] + python-version: [3.8] # , 3.9 + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions/download-artifact@v3 + with: + name: ci-packages-${{ github.sha }} + path: pypi + - run: ls -lh pypi/ + + - run: python -c "print('NB_DIRS=' + str(2 if '${{ matrix.pkg }}' == 'pytorch' else 1))" >> $GITHUB_ENV + - uses: ./.github/actions/pkg-check + with: + pkg-name: ${{ matrix.pkg }} + nb-dirs: ${{ env.NB_DIRS }} + + - uses: actions/upload-artifact@v3 + with: + name: ci-packages-${{ github.sha }} + path: pypi + + - uses: ./.github/actions/pkg-install + + install-meta-src: + needs: install-standalone + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + # max-parallel: 1 + matrix: + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] + pkg: ["", "lightning"] + python-version: [3.8] # , 3.9 + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - run: mkdir -p pypi + - uses: actions/download-artifact@v3 + if: ${{ matrix.pkg != '' }} + with: + name: ci-packages-${{ github.sha }} + path: pypi + - run: ls -lh pypi/ + + - uses: ./.github/actions/pkg-check + with: + pkg-name: ${{ matrix.pkg }} + + - uses: ./.github/actions/pkg-install + with: + pkg-name: "lightning" + pip-flags: "-U --pre --find-links ../pypi/" + + - name: Run CLI + run: python -m 
lightning --version + + install-meta-pypi: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + # max-parallel: 1 + matrix: + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] + python-version: [3.8] # , 3.9 + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Dowload package + # todo: download also lite after it is fist published + run: | + pip install -q -r .actions/requirements.txt + for pkg in 'app' 'pytorch' ; do + python .actions/assistant.py download-package "$pkg" --folder pypi + done + ls -lh pypi/ + + - name: Unzip packages + working-directory: pypi + run: for file in `ls *.gz`; do tar -xzf $file; done + - name: Show upacked pkgs + if: runner.os == 'linux' + run: | + sudo apt install -y tree + tree pypi/ -L 3 + + - name: Miror source + run: | + pip install -q -r .actions/requirements.txt + python .actions/assistant.py mirror-pkg2source pypi src + ls -R src/ + + - uses: ./.github/actions/pkg-check + with: + pkg-name: "lightning" + + - uses: ./.github/actions/pkg-install + with: + pkg-name: "lightning" + pip-flags: "-U --pre --find-links ../pypi/" + + - name: Run CLI + run: python -m lightning --version diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml new file mode 100644 index 0000000000000..6d39908d0e369 --- /dev/null +++ b/.github/workflows/docs-checks.yml @@ -0,0 +1,115 @@ +name: Check Docs +# https://github.com/marketplace/actions/sphinx-build + +on: + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +jobs: + make-doctest: + runs-on: ubuntu-20.04 + needs: make-html # make it depending on build docs to reduce load + strategy: + fail-fast: false + matrix: + pkg: ["app", "pytorch"] # TODO: , "lit" + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Reset caching + run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV + + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Cache pip + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-docs-test-pip-td${{ env.TIME_PERIOD }}-${{ hashFiles('requirements/${{ matrix.pkg }}/*.txt') }} + restore-keys: | + ${{ runner.os }}-docs-test-pip-td${{ env.TIME_PERIOD }}- + + - name: Install dependencies + env: + FREEZE_REQUIREMENTS: 1 + run: | + sudo apt-get update + sudo apt-get install -y cmake pandoc + pip --version + # python -m pip install --upgrade --user pip + pip install -r requirements/${{ matrix.pkg }}/docs.txt -r requirements/${{ matrix.pkg }}/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . 
+ pip list + shell: bash + + - name: Test Documentation + env: + SPHINX_MOCK_REQUIREMENTS: 0 + working-directory: ./docs/source-${{ matrix.pkg }} + run: | + # ToDo: proper parametrize + # First run the same pipeline as Read-The-Docs + make doctest + make coverage + + make-html: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + pkg: ["app", "pytorch", "lit"] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + # lfs: true + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Cache pip + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-docs-make-pip-${{ hashFiles('requirements/${{ matrix.pkg }}/*.txt') }} + restore-keys: | + ${{ runner.os }}-docs-make-pip- + + - name: Install dependencies + env: + FREEZE_REQUIREMENTS: 1 + run: | + sudo apt-get update + sudo apt-get install -y cmake pandoc texlive-latex-extra dvipng texlive-pictures + pip --version + pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip list + shell: bash + + - name: Make Documentation + working-directory: ./docs/source-${{ matrix.pkg }} + run: | + # ToDo: rather use python cmd + # First run the same pipeline as Read-The-Docs + make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" + + - name: Upload built docs + uses: actions/upload-artifact@v3 + with: + name: docs-${{ matrix.pkg }}-${{ github.sha }} + path: docs/build/html/ + # Use always() to always run this step to publish test results when there are test failures + if: success() From c7cdb7c885c91a951a2164a3aa5680064de74355 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 20:15:53 +0900 Subject: [PATCH 45/50] Revert unrelated changes --- .github/workflows/ci-pytorch-test-full.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 783a25aacf63f..9de97b4d24a28 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -156,12 +156,12 @@ jobs: # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - name: Upload pytest log file - if: always() + - name: Upload pytest results + if: failure() uses: actions/upload-artifact@v3 with: - name: pytest-logs-pl - path: tests/tests_pytorch/log-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.json + name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} + path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ 
matrix.release }}.xml - name: Run Examples working-directory: ./examples From 114a4c0d0301cced58d8c23ead4a33d66ec09f5a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 20:19:38 +0900 Subject: [PATCH 46/50] Add the combination of ex-conda jobs --- .github/workflows/ci-pytorch-test-full.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 9de97b4d24a28..2610b590d5973 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -32,8 +32,10 @@ jobs: include: - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} - - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} - - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.12"} # ex-conda - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} From cbda120fb1eaaa1380407eec7adbfa4e7fee373e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 20:38:23 +0900 Subject: [PATCH 47/50] Update checkgroup --- .github/checkgroup.yml | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 1ad9b7abb8746..05825847a9242 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -33,8 +33,10 @@ subprojects: # Note: 
updates here should be applied to the lightning_lite group - "pl-cpu (ubuntu-20.04 3.7, 1.9, oldest)" - "pl-cpu (ubuntu-20.04 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04 3.8, 1.9)" - "pl-cpu (ubuntu-20.04 3.8, 1.10)" - "pl-cpu (ubuntu-20.04 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04 3.9, 1.12)" - "pl-cpu (ubuntu-20.04 3.10, 1.12)" - "pl-cpu (macos-11, 3.7, 1.9, oldest)" - "pl-cpu (macos-11, 3.7, 1.12)" @@ -64,8 +66,10 @@ subprojects: checks: - "pl-cpu (ubuntu-20.04 3.7, 1.9, oldest)" - "pl-cpu (ubuntu-20.04 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04 3.8, 1.9)" - "pl-cpu (ubuntu-20.04 3.8, 1.10)" - "pl-cpu (ubuntu-20.04 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04 3.9, 1.12)" - "pl-cpu (ubuntu-20.04 3.10, 1.12)" - "pl-cpu (macos-11, 3.7, 1.9, oldest)" - "pl-cpu (macos-11, 3.7, 1.12)" @@ -161,15 +165,23 @@ subprojects: - "lightning-lite (GPUs)" - "mypy" # Lite also requires PL checks as it depends on Lite - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu (windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04 3.8, 1.9)" + - "pl-cpu (ubuntu-20.04 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04 3.9, 1.12)" + - "pl-cpu (ubuntu-20.04 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu (windows-2022, 3.10, 1.12)" - 
"make-doctest (pytorch)" - "make-html (pytorch)" - "pytorch-lightning (GPUs)" From 7d7e84e912f2cb9b067d77e160eacfad63bdfb84 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 21:12:39 +0900 Subject: [PATCH 48/50] revert timeout --- .github/workflows/ci-pytorch-test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 2610b590d5973..2be842d16dfac 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -50,7 +50,7 @@ jobs: # TODO: re-enable RC testing # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} - timeout-minutes: 100 + timeout-minutes: 40 steps: - uses: actions/checkout@v3 From c66ab148b5f8e96075dacfad929068f321dc94e4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 21:12:48 +0900 Subject: [PATCH 49/50] remove conda job --- .github/workflows/ci-pytorch-test-conda.yml | 110 -------------------- 1 file changed, 110 deletions(-) delete mode 100644 .github/workflows/ci-pytorch-test-conda.yml diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml deleted file mode 100644 index 0f40d074dc2bb..0000000000000 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Test PyTorch with Conda - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - paths: - - ".github/workflows/ci-pytorch-test-conda.yml" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -defaults: - run: - shell: bash -l {0} - -jobs: - pl-conda: - runs-on: ubuntu-20.04 - container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} - strategy: - fail-fast: false - matrix: - include: - - {python-version: "3.8", pytorch-version: "1.9"} - - {python-version: "3.8", pytorch-version: "1.10"} - - {python-version: "3.9", pytorch-version: "1.11"} - - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 40 - - steps: - - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/lightning/lightning - - - uses: actions/checkout@v3 - - - name: Update base dependencies - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - run: | - conda info - conda list - pip install -e .[test] - - - name: Freeze PIL (hotfix) - # import of PILLOW_VERSION which they recently removed in v9.0 in favor of __version__ - run: pip install "Pillow<9.0" # It messes with torchvision - - - name: DocTests - working-directory: ./src - run: pytest pytorch_lightning --cov=pytorch_lightning - - - name: Update all dependencies - env: - HOROVOD_BUILD_ARCH_FLAGS: "-mfma" - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - run: | - set -e - pip list - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --find-links https://download.pytorch.org/whl/torch_stable.html - # set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests - pip install pytest-timeout - pip list - # sanity check - python requirements/pytorch/check-avail-extras.py - - - name: Pull legacy checkpoints - run: bash 
.actions/pull_legacy_checkpoints.sh - - - name: Testing PyTorch - working-directory: tests/tests_pytorch - run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - - - name: Upload pytest results - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-torch${{ matrix.pytorch-version }} - path: tests/tests_pytorch/results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - if: failure() - - - name: Statistics - if: success() - working-directory: tests/tests_pytorch - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - if: success() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: cpu,pytest,torch${{ matrix.pytorch-version }} - name: CPU-coverage - fail_ci_if_error: false From e79855f74128d83cac40b590ba15dd4a795cfab6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 29 Sep 2022 21:17:25 +0900 Subject: [PATCH 50/50] revert docker build workflow file --- .github/workflows/ci-pytorch-dockers.yml | 221 +++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 .github/workflows/ci-pytorch-dockers.yml diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml new file mode 100644 index 0000000000000..2e9296c3df728 --- /dev/null +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -0,0 +1,221 @@ +name: Docker + +on: + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" + schedule: + - cron: "0 0 * * *" # at the end of every day + +concurrency: + group: 
${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }} + cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +env: + PUSH_TO_HUB: ${{ github.event_name == 'schedule' }} + +jobs: + build-pl: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - uses: docker/build-push-action@v3 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + file: dockers/release/Dockerfile + push: false # pushed in release-docker.yml only when PL is released + timeout-minutes: 50 + + build-xla: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + # the config used in '.circleci/config.yml`' + python_version: ["3.7"] + xla_version: ["1.12"] + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - uses: docker/login-action@v2 + if: env.PUSH_TO_HUB == 'true' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - uses: docker/build-push-action@v3 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + XLA_VERSION=${{ matrix.xla_version }} + file: dockers/base-xla/Dockerfile + push: ${{ env.PUSH_TO_HUB }} + tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + timeout-minutes: 60 + - uses: 
ravsamhq/notify-slack-action@v2 + if: failure() && env.PUSH_TO_HUB == 'true' + with: + status: ${{ job.status }} + token: ${{ secrets.GITHUB_TOKEN }} + notification_title: ${{ format('XLA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.xla_version) }} + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + + build-cuda: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-dockers.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - uses: docker/login-action@v2 + if: env.PUSH_TO_HUB == 'true' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - uses: docker/build-push-action@v3 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + file: dockers/base-cuda/Dockerfile + push: ${{ env.PUSH_TO_HUB }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + timeout-minutes: 95 + - uses: ravsamhq/notify-slack-action@v2 + if: failure() && env.PUSH_TO_HUB == 'true' + with: + status: ${{ job.status }} + token: ${{ secrets.GITHUB_TOKEN }} + notification_title: ${{ format('CUDA; {0} py{1} for *{2}*', runner.os, 
matrix.python_version, matrix.pytorch_version) }} + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>' # akihironitta + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + + build-ipu: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + # the config used in 'dockers/ci-runner-ipu/Dockerfile' + - {python_version: "3.9", pytorch_version: "1.9"} + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - uses: docker/login-action@v2 + if: env.PUSH_TO_HUB == 'true' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - uses: docker/build-push-action@v3 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + file: dockers/base-ipu/Dockerfile + push: ${{ env.PUSH_TO_HUB }} + tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + timeout-minutes: 100 + - uses: docker/build-push-action@v3 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + file: dockers/ci-runner-ipu/Dockerfile + push: ${{ env.PUSH_TO_HUB }} + tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }} + timeout-minutes: 10 + - uses: ravsamhq/notify-slack-action@v2 + if: failure() && env.PUSH_TO_HUB == 'true' + with: + status: ${{ job.status }} + token: ${{ secrets.GITHUB_TOKEN }} + notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + + build-hpu: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + # the config used in 'dockers/ci-runner-hpu/Dockerfile' + - {gaudi_version: 
"1.5.0", pytorch_version: "1.11.0"} + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - uses: docker/login-action@v2 + if: env.PUSH_TO_HUB == 'true' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - uses: docker/build-push-action@v3 + with: + build-args: | + DIST=latest + GAUDI_VERSION=${{ matrix.gaudi_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} + file: dockers/ci-runner-hpu/Dockerfile + push: ${{ env.PUSH_TO_HUB }} + tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} + timeout-minutes: 10 + - uses: ravsamhq/notify-slack-action@v2 + if: failure() && env.PUSH_TO_HUB == 'true' + with: + status: ${{ job.status }} + token: ${{ secrets.GITHUB_TOKEN }} + notification_title: ${{ format('HPU; {0} py{1} for *{2}*', runner.os, matrix.gaudi_version, matrix.pytorch_version) }} + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U02PV6CL144> <@U0355SJN6HK>' # arao & Mythravarun N R + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + + build-NGC: + runs-on: ubuntu-20.04 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build Conda Docker + # publish master/release + uses: docker/build-push-action@v3 + with: + file: dockers/nvidia/Dockerfile + push: false + timeout-minutes: 55