From 8e95af4ef4a1a1b2ada974b2a337f7048838891b Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 16:02:47 +0100 Subject: [PATCH 01/11] debug --- .github/workflows/tpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 6ce21d0f9ebc7..c38770bdadd64 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -3,7 +3,7 @@ name: Test PyTorch - TPU on: push: branches: [master, "release/*"] - pull_request_target: + pull_request: # FIXME: use `pull_request_target` branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: From 8c6de9f4c6fdf1e57a2c4c004c44dca57e8f7a42 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 16:03:21 +0100 Subject: [PATCH 02/11] update --- dockers/base-xla/Dockerfile | 6 +++--- environment.yml | 10 ++-------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index b8ac175044f80..267453b7d56f5 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -86,11 +86,11 @@ RUN pip --version && \ rm *.whl # Get package -COPY ./ ./pytorch-lightning/ +COPY ./ ./lightning/ RUN \ python --version && \ - cd pytorch-lightning && \ + cd lightning && \ pip install -q -r .actions/requirements.txt && \ # Pin mkl version to avoid OSError on torch import # OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory @@ -103,7 +103,7 @@ RUN \ # install PL dependencies pip install --requirement ./requirements/pytorch/devel.txt --no-cache-dir && \ cd .. && \ - rm -rf pytorch-lightning && \ + rm -rf lightning && \ rm -rf /root/.cache RUN \ diff --git a/environment.yml b/environment.yml index 330a42f581767..b05061146a72a 100644 --- a/environment.yml +++ b/environment.yml @@ -32,21 +32,15 @@ dependencies: - pytorch>=1.10.* - future>=0.17.1 - PyYAML>=5.1 - - tqdm>=4.41.0 + - tqdm>=4.57.0 - fsspec[http]>=2021.06.1 #- tensorboard>=2.2.0 # not needed, already included in pytorch # Optional #- nvidia-apex # missing for py3.8 - - scikit-learn>=0.20.0 + - scikit-learn >0.22.1 - matplotlib>=3.1.1 - omegaconf>=2.0.5 # Examples - torchvision>=0.11.* - - - pip: - - mlflow>=1.0.0 - - comet_ml>=3.1.12 - - wandb>=0.10.22 - - neptune-client>=0.10.0 From 6cde23405cc61e674d70ecf7e7b8976221f9f40e Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 16:38:54 +0100 Subject: [PATCH 03/11] python --- .github/workflows/tpu-tests.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index c38770bdadd64..e2381c5633a12 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -30,6 +30,10 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a +defaults: + run: + shell: bash + jobs: # TODO: package parametrization test-on-tpus: @@ -68,10 +72,19 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | - python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') - data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/base-xla/tpu_workflow.jsonnet - shell: bash + import os + fname = 'dockers/base-xla/tpu_workflow.jsonnet' + with open(fname) as fo: + data = fo.read() + data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER")) + data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER")) + data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER")) + data = data.replace('{SHA}', os.getenv("SHA")) + with open(fname, "w") as fw: + fw.write(data) + shell: python + - name: Show jsonnet + run: cat dockers/base-xla/tpu_workflow.jsonnet - uses: google-github-actions/auth@v1 with: From da205ca104486dfa128e11d23796b2ec6369d1d2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 16:57:06 +0100 Subject: [PATCH 04/11] param --- .github/workflows/tpu-tests.yml | 13 ++-- dockers/base-xla/tpu_workflow_lite.jsonnet | 63 +++++++++++++++++++ ...w.jsonnet => tpu_workflow_pytorch.jsonnet} | 11 +--- 3 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 dockers/base-xla/tpu_workflow_lite.jsonnet rename dockers/base-xla/{tpu_workflow.jsonnet => tpu_workflow_pytorch.jsonnet} (78%) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index e2381c5633a12..171d5450bd7ca 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -35,12 +35,16 @@ defaults: shell: bash jobs: - # TODO: package parametrization test-on-tpus: runs-on: ubuntu-22.04 if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 + strategy: + fail-fast: false + matrix: + # TODO: add also lightning + pkg-name: ["lite", "pytorch"] timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet` steps: @@ -68,12 +72,13 @@ jobs: - name: Update jsonnet env: + SCOPE: ${{ matrix.pkg-name }} XLA_VER: 1.12 PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | import os - fname = 'dockers/base-xla/tpu_workflow.jsonnet' + fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet' with open(fname) as fo: data = fo.read() data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER")) @@ -84,7 +89,7 @@ jobs: fw.write(data) shell: python - name: Show jsonnet - run: cat dockers/base-xla/tpu_workflow.jsonnet + run: cat dockers/base-xla/tpu_workflow_{{ matrix.pkg-name }}.jsonnet - uses: google-github-actions/auth@v1 with: @@ -99,7 +104,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_{{ matrix.pkg-name }}.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') diff --git a/dockers/base-xla/tpu_workflow_lite.jsonnet b/dockers/base-xla/tpu_workflow_lite.jsonnet new file mode 100644 index 0000000000000..170cc3cefbd64 --- /dev/null +++ b/dockers/base-xla/tpu_workflow_lite.jsonnet @@ -0,0 +1,63 @@ +local base = import 'templates/base.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import "templates/utils.libsonnet"; + +local tputests = base.BaseTest { + frameworkPrefix: 'pl', + modelName: 'tpu-tests', + mode: 'postsubmit', + configMaps: [], + + timeout: 6000, # 100 minutes, in seconds. + + image: 'pytorchlightning/pytorch_lightning', + imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}', + + tpuSettings+: { + softwareVersion: 'pytorch-{PYTORCH_VERSION}', + }, + accelerator: tpus.v3_8, + + command: utils.scriptCommand( + ||| + set +x # turn off tracing, spammy + set -e # exit on error + + source ~/.bashrc + conda activate lightning + + echo "--- Fetch the SHA's changes ---" + git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git + cd lightning + if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty + # PR triggered it, check it out + git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} + git -c advice.detachedHead=false checkout {SHA} + fi + + echo "--- Install packages ---" + PACKAGE_NAME=lite pip install .[dev] + pip list + + echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS + export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" + + echo "--- Sanity check TPU availability ---" + python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" + echo "Sanity check passed!" + + echo "--- Running Lite tests ---" + cd tests/tests_lite + PL_RUN_TPU_TESTS=1 coverage run --source=lightning_lite -m pytest -vv --durations=0 ./ + + echo "--- Running standalone Lite tests ---" + PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh + + echo "--- Generating coverage ---" + coverage xml + cat coverage.xml | tr -d '\t' + ||| + ), +}; + +tputests.oneshotJob diff --git a/dockers/base-xla/tpu_workflow.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet similarity index 78% rename from dockers/base-xla/tpu_workflow.jsonnet rename to dockers/base-xla/tpu_workflow_pytorch.jsonnet index 1d006fec89b6a..6d3ba1f25b79c 100644 --- a/dockers/base-xla/tpu_workflow.jsonnet +++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet @@ -36,25 +36,16 @@ local tputests = base.BaseTest { fi echo "--- Install packages ---" - PACKAGE_NAME=lite pip install -e .[dev] - PACKAGE_NAME=pytorch pip install -e .[dev] + PACKAGE_NAME=pytorch pip install .[dev] pip list echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" echo "--- Sanity check TPU availability ---" - python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" python -c "from pytorch_lightning.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()" echo "Sanity check passed!" - echo "--- Running Lite tests ---" - cd tests/tests_lite - PL_RUN_TPU_TESTS=1 coverage run --source=lightning_lite -m pytest -vv --durations=0 ./ - - echo "--- Running standalone Lite tests ---" - PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh - echo "--- Running PL tests ---" cd ../tests_pytorch PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ From 240dcb0447be416a368518b1d5115050140e604c Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 16:58:57 +0100 Subject: [PATCH 05/11] $ --- .github/workflows/tpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 171d5450bd7ca..98da6410a1475 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -89,7 +89,7 @@ jobs: fw.write(data) shell: python - name: Show jsonnet - run: cat dockers/base-xla/tpu_workflow_{{ matrix.pkg-name }}.jsonnet + run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet - uses: google-github-actions/auth@v1 with: @@ -104,7 +104,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_{{ matrix.pkg-name }}.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') From 16ce4d08db1cedf5edcde313516d85c8f771866a Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 17:12:23 +0100 Subject: [PATCH 06/11] checkout --- dockers/base-xla/tpu_workflow_lite.jsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dockers/base-xla/tpu_workflow_lite.jsonnet b/dockers/base-xla/tpu_workflow_lite.jsonnet index 170cc3cefbd64..663313ebc7b89 100644 --- a/dockers/base-xla/tpu_workflow_lite.jsonnet +++ b/dockers/base-xla/tpu_workflow_lite.jsonnet @@ -27,12 +27,12 @@ local tputests = base.BaseTest { conda activate lightning echo "--- Fetch the SHA's changes ---" - git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git + git clone --single-branch https://github.com/Lightning-AI/lightning.git cd lightning if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty # PR triggered it, check it out - git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} - git -c advice.detachedHead=false checkout {SHA} + git fetch origin pull/{PR_NUMBER}/head:ACTUAL_PR + git checkout ACTUAL_PR fi echo "--- Install packages ---" From a771df40ea717834a90be0561676c396172872ee Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 30 Nov 2022 18:07:20 +0100 Subject: [PATCH 07/11] echo --- dockers/base-xla/tpu_workflow_lite.jsonnet | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dockers/base-xla/tpu_workflow_lite.jsonnet b/dockers/base-xla/tpu_workflow_lite.jsonnet index 663313ebc7b89..85171f39669e4 100644 --- a/dockers/base-xla/tpu_workflow_lite.jsonnet +++ b/dockers/base-xla/tpu_workflow_lite.jsonnet @@ -26,12 +26,14 @@ local tputests = base.BaseTest { source ~/.bashrc conda activate lightning - echo "--- Fetch the SHA's changes ---" + echo "--- Cloning lightning repo ---" git clone --single-branch https://github.com/Lightning-AI/lightning.git cd lightning + # PR triggered it, check it out if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty - # PR triggered it, check it out + echo "--- Fetch the PR changes ---" git fetch origin pull/{PR_NUMBER}/head:ACTUAL_PR + echo "--- Checkout PR changes ---" git checkout ACTUAL_PR fi From 7bf069f004e05d5a96703fc04e412fa22a0ccaa9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 1 Dec 2022 04:34:55 +0100 Subject: [PATCH 08/11] 1 --- .github/workflows/tpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 98da6410a1475..d438afba4032a 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -42,6 +42,7 @@ jobs: PYTHON_VER: 3.7 strategy: fail-fast: false + max-parallel: 1 # run sequential matrix: # TODO: add also lightning pkg-name: ["lite", "pytorch"] From 6d8da8c5756856e8e7a47035f2fc386ba8cee665 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 2 Dec 2022 11:09:47 +0100 Subject: [PATCH 09/11] revert --- dockers/base-xla/tpu_workflow_lite.jsonnet | 6 +++--- dockers/base-xla/tpu_workflow_pytorch.jsonnet | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dockers/base-xla/tpu_workflow_lite.jsonnet b/dockers/base-xla/tpu_workflow_lite.jsonnet index 85171f39669e4..887f0f744a206 100644 --- a/dockers/base-xla/tpu_workflow_lite.jsonnet +++ b/dockers/base-xla/tpu_workflow_lite.jsonnet @@ -27,14 +27,14 @@ local tputests = base.BaseTest { conda activate lightning echo "--- Cloning lightning repo ---" - git clone --single-branch https://github.com/Lightning-AI/lightning.git + git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git cd lightning # PR triggered it, check it out if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty echo "--- Fetch the PR changes ---" - git fetch origin pull/{PR_NUMBER}/head:ACTUAL_PR + git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} echo "--- Checkout PR changes ---" - git checkout ACTUAL_PR + git -c advice.detachedHead=false checkout {SHA} fi echo "--- Install packages ---" diff --git a/dockers/base-xla/tpu_workflow_pytorch.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet index 6d3ba1f25b79c..d6634e80757b4 100644 --- a/dockers/base-xla/tpu_workflow_pytorch.jsonnet +++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet @@ -26,12 +26,14 @@ local tputests = base.BaseTest { source ~/.bashrc conda activate lightning - echo "--- Fetch the SHA's changes ---" + echo "--- Cloning lightning repo ---" git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git cd lightning + # PR triggered it, check it out if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty - # PR triggered it, check it out + echo "--- Fetch the PR changes ---" git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} + echo "--- Checkout PR changes ---" git -c advice.detachedHead=false checkout {SHA} fi From 0d259616c313992c557ee6dbf0f7090b3510161c Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 3 Dec 2022 03:20:58 +0100 Subject: [PATCH 10/11] path --- dockers/base-xla/tpu_workflow_pytorch.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-xla/tpu_workflow_pytorch.jsonnet b/dockers/base-xla/tpu_workflow_pytorch.jsonnet index d6634e80757b4..5acaf5ce99f34 100644 --- a/dockers/base-xla/tpu_workflow_pytorch.jsonnet +++ b/dockers/base-xla/tpu_workflow_pytorch.jsonnet @@ -49,7 +49,7 @@ local tputests = base.BaseTest { echo "Sanity check passed!" echo "--- Running PL tests ---" - cd ../tests_pytorch + cd tests/tests_pytorch PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ echo "--- Running standalone PL tests ---" From 03749938f6e75ec73cce594be070270e89baa0b2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Sat, 3 Dec 2022 16:58:05 +0900 Subject: [PATCH 11/11] Apply suggestions from code review --- .github/workflows/tpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index d438afba4032a..24c5fa1a205d5 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -3,7 +3,7 @@ name: Test PyTorch - TPU on: push: branches: [master, "release/*"] - pull_request: # FIXME: use `pull_request_target` + pull_request_target: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: