From 9cd332b526285b67a05ab1a68bac274e9cff3f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:15:50 +0200 Subject: [PATCH 01/25] Migrate TPU tests to GitHub actions --- .circleci/config.yml | 146 ---------------------- .github/CODEOWNERS | 1 - .github/checkgroup.yml | 2 +- .github/workflows/ci-circleci.yml | 31 ----- .github/workflows/ci-pytorch-test-tpu.yml | 116 +++++++++++++++++ 5 files changed, 117 insertions(+), 179 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 .github/workflows/ci-circleci.yml create mode 100644 .github/workflows/ci-pytorch-test-tpu.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 05c901eee0e82..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,146 +0,0 @@ -# Python CircleCI 2.1 configuration file. -version: 2.1 -orbs: - gcp-gke: circleci/gcp-gke@1.4.0 - go: circleci/go@1.7.1 - codecov: codecov/codecov@1.1.0 -parameters: - GHA_Actor: - type: string - default: "" - GHA_Action: - type: string - default: "" - GHA_Event: - type: string - default: "" - -references: - - make_docs: &make_docs - run: - name: Make Documentation - command: | - # the image uses python 2.7 by default, force a different version - pyenv global 3.7.3 - python --version - pip install -e . -r requirements/pytorch/docs.txt - pip list - cd docs - make clean - make html --jobs 2 SPHINXOPTS="-W" - - checkout_ml_testing: &checkout_ml_testing - run: - name: Checkout ml-testing-accelerators - command: | - git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git - cd ml-testing-accelerators - git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable - git checkout stable - cd .. 
- - install_jsonnet: &install_jsonnet - run: - name: Install jsonnet - command: | - go install github.com/google/go-jsonnet/cmd/jsonnet@latest - - update_jsonnet: &update_jsonnet - run: - name: Update jsonnet - command: | - export SHA=$(git rev-parse --short HEAD) - export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f $SHA | awk -F'/' '{print $3}') - python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') - data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/tpu-tests/tpu_test_cases.jsonnet - - deploy_cluster: &deploy_cluster - run: - name: Deploy the job on the kubernetes cluster - command: | - export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \ - job_name=${job_name#job.batch/} - job_name=${job_name% created} - pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') - echo "GKE pod name: $pod_name" - echo "Waiting on kubernetes job: $job_name" - i=0 && \ - # N checks spaced 30s apart = 900s total. - status_code=2 && \ - # Check on the job periodically. Set the status code depending on what - # happened to the job in Kubernetes. If we try MAX_CHECKS times and - # still the job hasn't finished, give up and return the starting - # non-zero status code. - printf "Waiting for job to finish: " && \ - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SLEEP; done && \ - echo "Done waiting. 
Job status code: $status_code" && \ - kubectl logs -f $pod_name --container=train > /tmp/full_output.txt - if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ - # First portion is the test logs. Print these to Github Action stdout. - cat xx00 && \ - echo "Done with log retrieval attempt." && \ - exit $status_code - - stats: &stats - run: - name: Statistics - command: | - mv ./xx01 coverage.xml - -jobs: - - TPU-tests: - docker: - - image: circleci/python:3.7 - environment: - - XLA_VER: 1.12 - - PYTHON_VER: 3.7 - - MAX_CHECKS: 1000 - - CHECK_SLEEP: 5 - steps: - - checkout - - go/install - - *checkout_ml_testing - - gcp-gke/install - - gcp-gke/update-kubeconfig-with-credentials: - cluster: $GKE_CLUSTER - perform-login: true - - *install_jsonnet - - *update_jsonnet - - *deploy_cluster - - *stats - - codecov/upload: - file: coverage.xml - flags: tpu,pytest - upload_name: TPU-coverage - - - store_artifacts: - path: coverage.xml - - build-Docs: - docker: - - image: readthedocs/build:latest - steps: - - checkout - - run: - command: | - git submodule update --init --recursive - name: Init git submodule - - *make_docs - - store_artifacts: - # allows us to preview the generated html pages - path: docs/build/html/ - destination: html - -workflows: - #build-docs: # FixMe - # when: << pipeline.parameters.GHA_Action >> - # jobs: - # - build-Docs - test-on-tpus: - when: << pipeline.parameters.GHA_Action >> - jobs: - - TPU-tests diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2460e2a71d761..4ee6d7586b343 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,7 +9,6 @@ # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton -/.circleci/ @borda @carmocca @akihironitta @tchaton /.azure/ @borda @carmocca @akihironitta @tchaton /dockers/ @borda @carmocca @akihironitta @tchaton *.yml @borda @carmocca @akihironitta @tchaton diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 
6acef1517738c..4f951ce3216d7 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -9,7 +9,7 @@ subprojects: - id: "CI: CircleCI" paths: - ".circleci/**" - - ".github/workflows/ci-circleci.yml" + - ".github/workflows/ci-pytorch-test-tpu.yml" checks: - "test-on-tpus" diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml deleted file mode 100644 index d1ff85e45e0dd..0000000000000 --- a/.github/workflows/ci-circleci.yml +++ /dev/null @@ -1,31 +0,0 @@ -on: - push: - branches: [master, "release/*"] - paths: - - ".github/workflows/ci-circleci.yml" - - ".circleci/config.yml" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - pull_request_target: - branches: [master, "release/*"] - paths: - - ".github/workflows/ci-circleci.yml" - - ".circleci/config.yml" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - -jobs: - # https://github.com/marketplace/actions/trigger-circleci-pipeline - trigger-circleci: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5 - env: - CCI_TOKEN: ${{ secrets.CCI_TOKEN }} diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml new file mode 100644 index 0000000000000..64863eb6064c3 --- /dev/null +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -0,0 +1,116 @@ +on: + push: + branches: [master, "release/*"] + paths: + - ".github/workflows/ci-pytorch-test-tpu.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + pull_request: + branches: [master, "release/*"] + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped + paths: + - 
".github/workflows/ci-pytorch-test-tpu.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +jobs: + test-on-tpus: + runs-on: ubuntu-22.04 + if: github.event.pull_request.draft == false + env: + PYTHON_VER: 3.7 + CHECK_SLEEP: 5 + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: $PYTHON_VER + + - name: Checkout ml-testing-accelerators + run: | + git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git + cd ml-testing-accelerators + git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable + git checkout stable + + - name: Install jsonnet + uses: actions/setup-go@v3 + with: + go-version: '1.7.1' + run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest + + - name: Update jsonnet + env: + XLA_VER: 1.12 + PR_NUMBER: ${{ github.event.number }} + SHA: ${{ github.sha }} + run: | + python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') + data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" + cat dockers/tpu-tests/tpu_test_cases.jsonnet + shell: bash + + - name: Deploy cluster + run: | + export PATH=$PATH:$HOME/go/bin + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) + job_name=${job_name#job.batch/} + job_name=${job_name% created} + pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') + echo "GKE pod name: $pod_name" + echo "Waiting on kubernetes 
job: $job_name" + status_code=2 && + # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes. + printf "Waiting for job to finish: " + while true; do + if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then + status_code=1 && break; + elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then + status_code=0 && break; + else + printf "."; + fi; + sleep $CHECK_SLEEP; + done + echo "Done waiting. Job status code: $status_code" + kubectl logs -f $pod_name --container=train > /tmp/full_output.txt + if grep -q '' /tmp/full_output.txt; then + csplit /tmp/full_output.txt '//'; + else + mv /tmp/full_output.txt xx00; + fi + # First portion is the test logs. + cat xx00 && echo "Done with log retrieval attempt." + exit $status_code + shell: bash + + - name: Statistics + if: success() + working-directory: tests/tests_pytorch + run: | + mv ./xx01 coverage.xml + coverage report + coverage xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + if: always() + # see: https://github.com/actions/toolkit/issues/399 + continue-on-error: true + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: tests/tests_pytorch/coverage.xml + flags: tpu,pytest,python${{ matrix.python-version }} + name: TPU-coverage + fail_ci_if_error: false From 9eac22e1d9363b6ee217706c3caaf66173a50732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:18:36 +0200 Subject: [PATCH 02/25] No working dir --- .github/workflows/ci-pytorch-test-tpu.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 64863eb6064c3..02605d1f0b476 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -97,7 +97,6 @@ jobs: - name: Statistics if: success() - working-directory: 
tests/tests_pytorch run: | mv ./xx01 coverage.xml coverage report @@ -110,7 +109,7 @@ jobs: continue-on-error: true with: token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: tpu,pytest,python${{ matrix.python-version }} + file: coverage.xml + flags: tpu,pytest,python$PYTHON_VER name: TPU-coverage fail_ci_if_error: false From cafc78e6e6cbd573aaf054819db8912dda87b21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:27:01 +0200 Subject: [PATCH 03/25] Keep _target --- .github/workflows/ci-pytorch-test-tpu.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 02605d1f0b476..e232a3870bfc7 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -7,7 +7,7 @@ on: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - pull_request: + pull_request_target: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: @@ -32,6 +32,8 @@ jobs: steps: - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} - uses: actions/setup-python@v4 with: @@ -53,14 +55,21 @@ jobs: - name: Update jsonnet env: XLA_VER: 1.12 - PR_NUMBER: ${{ github.event.number }} - SHA: ${{ github.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA: ${{ github.event.pull_request.head.sha }} run: | python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" cat dockers/tpu-tests/tpu_test_cases.jsonnet shell: bash + # 
https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine + - uses: google-github-actions/get-gke-credentials@fb08709ba27618c31c09e014e1d8364b02e5042e + with: + cluster_name: TODO + location: us-central1-c + credentials: ${{ secrets.GKE_SA_KEY }} + - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin From 29b915cf3f51aa379f678506da956a29805c66b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:29:25 +0200 Subject: [PATCH 04/25] Dont skip draft --- .github/workflows/ci-pytorch-test-tpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index e232a3870bfc7..674ca51a889e9 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -24,7 +24,8 @@ concurrency: jobs: test-on-tpus: runs-on: ubuntu-22.04 - if: github.event.pull_request.draft == false + # FIXME: uncomment after finished + # if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 CHECK_SLEEP: 5 From 822bcc40f0121653cae33fae39fd63f99ff673ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:30:14 +0200 Subject: [PATCH 05/25] CHECK_SLEEP --- .github/workflows/ci-pytorch-test-tpu.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 674ca51a889e9..7ab5c1ab02adc 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -28,7 +28,6 @@ jobs: # if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 - CHECK_SLEEP: 5 timeout-minutes: 60 steps: @@ -91,7 +90,7 @@ jobs: else printf "."; fi; - sleep $CHECK_SLEEP; + sleep 5; done echo "Done waiting. 
Job status code: $status_code" kubectl logs -f $pod_name --container=train > /tmp/full_output.txt From 94adbf861c1ce8c42b38daf1cb4d298199ee09ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 18:33:22 +0200 Subject: [PATCH 06/25] Not yet --- .github/workflows/ci-pytorch-test-tpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 7ab5c1ab02adc..a31b0203e4081 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -7,7 +7,8 @@ on: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - pull_request_target: + # FIXME: use _target after merge to share secrets + pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: From 8e79c5801ae2bfff63adae9583bb0f4b09e91821 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 02:45:19 +0200 Subject: [PATCH 07/25] Remove recurrent cleanup script --- .github/workflows/events-recurrent.yml | 43 -------------------------- 1 file changed, 43 deletions(-) delete mode 100644 .github/workflows/events-recurrent.yml diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml deleted file mode 100644 index 0a8ff4e006792..0000000000000 --- a/.github/workflows/events-recurrent.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: Recurrent - -# https://jasonet.co/posts/scheduled-actions/ -# https://github.community/t/distinct-job-for-each-schedule/17811/2 -on: - push: - branches: [ master ] - schedule: - - cron: "*/20 * * * *" # At every 20 minutes - -env: - GKE_CLUSTER: lightning-cluster - GKE_ZONE: us-central1-a - -jobs: - tpu-cleanup: - name: TPU cleaning - if: ${{ github.repository_owner == 'Lightning-AI' }} - runs-on: ubuntu-20.04 - - steps: - - 
name: Setup gcloud CLI - uses: google-github-actions/setup-gcloud@v0 - with: - version: '290.0.1' - service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }} - project_id: ${{ secrets.GKE_PROJECT }} - export_default_credentials: true - # Get the GKE credentials so we can deploy to the cluster; Use either zone or region depending on cluster setup. - - run: |- - gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE" - shell: bash - - - name: Clean all mong hanging jobs - run: | - # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job - # that has been around longer than 1hr. First print all columns for - # matches, then execute the delete. - jobs_to_delete=$(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}') - echo $jobs_to_delete - if [ ${#jobs_to_delete} -gt 1 ]; - then kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}'); - fi From 86d7e6ff626de0f295b31922ee34e40f7bdeb2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 02:51:29 +0200 Subject: [PATCH 08/25] Set secrets --- .github/workflows/ci-pytorch-test-tpu.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index a31b0203e4081..928d3bc5f7f8b 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -22,6 +22,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} +env: + PROJECT_ID: ${{ secrets.GKE_PROJECT }} + GKE_CLUSTER: lightning-cluster + GKE_ZONE: us-central1-a + jobs: test-on-tpus: runs-on: ubuntu-22.04 @@ -67,9 +72,9 @@ jobs: # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine - uses: google-github-actions/get-gke-credentials@fb08709ba27618c31c09e014e1d8364b02e5042e with: - cluster_name: TODO - location: us-central1-c - credentials: ${{ secrets.GKE_SA_KEY }} + cluster_name: ${{ env.GKE_CLUSTER }} + location: ${{ env.GKE_ZONE }} + credentials: ${{ secrets.GKE_SA_KEY_BASE64 }} - name: Deploy cluster run: | From 477b565d9b2a43f98c5a8fba3d5777b92925f8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 03:09:14 +0200 Subject: [PATCH 09/25] a step cannot have both the `uses` and `run` keys --- .github/workflows/ci-pytorch-test-tpu.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 928d3bc5f7f8b..5924b366118c4 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -52,10 +52,11 @@ jobs: git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable git checkout stable - - name: Install jsonnet - uses: actions/setup-go@v3 + - uses: actions/setup-go@v3 with: go-version: '1.7.1' + + - name: Install jsonnet run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest - name: Update jsonnet From 4c2091cbdfac10b2782b14abcebab86ba5b32538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 03:13:46 +0200 Subject: [PATCH 10/25] Version $PYTHON_VER was not found in the local cache --- .github/workflows/ci-pytorch-test-tpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml 
b/.github/workflows/ci-pytorch-test-tpu.yml index 5924b366118c4..f972ee2ccb845 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -43,7 +43,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VER + python-version: ${{ env.PYTHON_VER }} - name: Checkout ml-testing-accelerators run: | @@ -126,6 +126,6 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} file: coverage.xml - flags: tpu,pytest,python$PYTHON_VER + flags: tpu,pytest,python${{ env.PYTHON_VER }} name: TPU-coverage fail_ci_if_error: false From 8d750e383b94b700a9d7ccba932782f5867081a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 03:30:30 +0200 Subject: [PATCH 11/25] can't load package ... ($GOPATH not set) --- .github/workflows/ci-pytorch-test-tpu.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index f972ee2ccb845..390d88f2a49cd 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -56,6 +56,14 @@ jobs: with: go-version: '1.7.1' + # fix for "can't load package ... 
($GOPATH not set)" + # https://github.com/actions/setup-go/issues/12#issuecomment-583361333 + - name: Set GOPATH + run: | + echo "##[set-env name=GOPATH;]$(dirname $GITHUB_WORKSPACE)" + echo "##[add-path]$(dirname $GITHUB_WORKSPACE)/bin" + shell: bash + - name: Install jsonnet run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest From 77e9a70841564e10915c660f17dcb297b22cbf1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 03:33:57 +0200 Subject: [PATCH 12/25] The `set-env` command is disabled --- .github/workflows/ci-pytorch-test-tpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 390d88f2a49cd..8255e4eb44003 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -60,8 +60,8 @@ jobs: # https://github.com/actions/setup-go/issues/12#issuecomment-583361333 - name: Set GOPATH run: | - echo "##[set-env name=GOPATH;]$(dirname $GITHUB_WORKSPACE)" - echo "##[add-path]$(dirname $GITHUB_WORKSPACE)/bin" + echo "GOPATH=$(dirname $GITHUB_WORKSPACE)" >> $GITHUB_ENV + echo "$(dirname $GITHUB_WORKSPACE)/bin" >> $GITHUB_PATH shell: bash - name: Install jsonnet From c54d082146ed0396628b6ef9d8a3c81158022fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 16:10:12 +0200 Subject: [PATCH 13/25] Try updating go --- .github/workflows/ci-pytorch-test-tpu.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 8255e4eb44003..66c6dd0ee0ec3 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -54,15 +54,15 @@ jobs: - uses: actions/setup-go@v3 with: - go-version: '1.7.1' + go-version: '1.19' # fix for "can't load package ... 
($GOPATH not set)" # https://github.com/actions/setup-go/issues/12#issuecomment-583361333 - - name: Set GOPATH - run: | - echo "GOPATH=$(dirname $GITHUB_WORKSPACE)" >> $GITHUB_ENV - echo "$(dirname $GITHUB_WORKSPACE)/bin" >> $GITHUB_PATH - shell: bash + #- name: Set GOPATH + # run: | + # echo "GOPATH=$(dirname $GITHUB_WORKSPACE)" >> $GITHUB_ENV + # echo "$(dirname $GITHUB_WORKSPACE)/bin" >> $GITHUB_PATH + # shell: bash - name: Install jsonnet run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest From 18182a2b8711423f2a49d85ab856284a3e275bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 30 Sep 2022 16:40:03 +0200 Subject: [PATCH 14/25] Match timeout --- .github/workflows/ci-pytorch-test-tpu.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 66c6dd0ee0ec3..032cb2ce24a00 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -34,7 +34,7 @@ jobs: # if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 - timeout-minutes: 60 + timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` steps: - uses: actions/checkout@v3 @@ -56,14 +56,6 @@ jobs: with: go-version: '1.19' - # fix for "can't load package ... 
($GOPATH not set)" - # https://github.com/actions/setup-go/issues/12#issuecomment-583361333 - #- name: Set GOPATH - # run: | - # echo "GOPATH=$(dirname $GITHUB_WORKSPACE)" >> $GITHUB_ENV - # echo "$(dirname $GITHUB_WORKSPACE)/bin" >> $GITHUB_PATH - # shell: bash - - name: Install jsonnet run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest From 8685e2aa464904847e2432694de683e93ce259bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 3 Oct 2022 18:23:19 +0200 Subject: [PATCH 15/25] simplify path --- dockers/tpu-tests/tpu_test_cases.jsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 43ab3ab2559d5..a9d489ca82eaf 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -27,8 +27,8 @@ local tputests = base.BaseTest { conda activate lightning echo "--- Fetch the SHA's changes ---" - git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git /home/runner/work/lightning - cd home/runner/work/lightning + git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git + cd lightning git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER} git -c advice.detachedHead=false checkout {SHA} From 34ba453eaee1aa56034938605d1715aacd3dcc48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 5 Oct 2022 01:02:25 +0200 Subject: [PATCH 16/25] More cleanup --- .github/checkgroup.yml | 15 ++++++------ .github/workflows/README.md | 2 +- .github/workflows/ci-pytorch-dockers.yml | 2 +- .lightningignore | 1 - README.md | 31 +++++++++--------------- docs/README.md | 3 +-- src/pytorch_lightning/README.md | 31 ++++++++---------------- 7 files changed, 31 insertions(+), 54 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 3584561a4ac88..6f607096e7adc 100644 --- a/.github/checkgroup.yml +++ 
b/.github/checkgroup.yml @@ -6,13 +6,6 @@ subprojects: # SECTION: pytorch_lightning - - id: "CI: CircleCI" - paths: - - ".circleci/**" - - ".github/workflows/ci-pytorch-test-tpu.yml" - checks: - - "test-on-tpus" - - id: "pytorch_lightning" paths: # all examples don't need to be added because they aren't used in CI, but these are @@ -57,7 +50,7 @@ subprojects: - "pl-slow (macOS-11, 3.7, 1.11)" - "pl-slow (ubuntu-20.04, 3.7, 1.11)" - "pl-slow (windows-2022, 3.7, 1.11)" - # TODO: since this job cannot run on forks, it cannot be required or it will block all PL PRs from forks + # TODO: since this job has intermittent availability, it cannot be required or it will block all PL PRs from forks #- "test-on-tpus" - id: "pytorch_lightning: CPU" @@ -109,6 +102,12 @@ subprojects: checks: - "pytorch-lightning (IPUs)" + - id: "pytorch-lightning: TPU" + paths: + - ".github/workflows/ci-pytorch-test-tpu.yml" + checks: + - "test-on-tpus" + - id: "pytorch_lightning: Docs" paths: - "docs/source-pytorch/**" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index b9acc1c060968..d7de900913a19 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -12,7 +12,7 @@ | pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | | pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | | PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | +| test-on-tpus | .github/workflows/ci-pytorch-test-tpu.yml | Run only TPU-specific tests. 
| TPU | - \*Accelerators used in CI diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 2e9296c3df728..48b7399c9f178 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -55,7 +55,7 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.circleci/config.yml`' + # the config used in '.github/workflows/ci-pytorch-test-tpu.yml' python_version: ["3.7"] xla_version: ["1.12"] steps: diff --git a/.lightningignore b/.lightningignore index f3bdf641c1425..4ce8d526e30e3 100644 --- a/.lightningignore +++ b/.lightningignore @@ -1,6 +1,5 @@ _notebooks .azure -.circleci .github .ipynb_checkpoints .pytest_cache diff --git a/README.md b/README.md index 0d109db3e45e2..00b1399301a05 100644 --- a/README.md +++ b/README.md @@ -89,17 +89,15 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -137,14 +135,7 @@ pip install pytorch-lightning['extra'] conda install pytorch-lightning -c conda-forge ``` -#### Install stable 1.7.x - -The actual status of 1.7 \[stable\] is the following: - -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch) -[![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) -[![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) +#### Install stable version Install future release from the source @@ -152,7 +143,7 @@ Install future release from the source pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/release/pytorch.zip -U ``` -#### Install bleeding-edge - future 1.6 +#### Install bleeding-edge Install nightly from the source (no guarantees) diff --git a/docs/README.md b/docs/README.md index 54f39a8b4fd0c..d59458b5a58ef 100644 --- a/docs/README.md +++ b/docs/README.md @@ -48,8 +48,7 @@ make docs and open `docs/build/html/index.html` in your browser. -When you send a PR the continuous integration will run tests and build the docs. 
You can access a preview of the html pages in the -_Artifacts_ tab in CircleCI when you click on the task named _build-Docs_ of _ci-tests_ at the bottom of the PR page. +When you send a PR the continuous integration will run tests and build the docs. Notes: diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index 914596c0a9d2f..9bf50343aefcf 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -78,17 +78,15 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -128,22 +126,13 @@ conda install pytorch-lightning -c conda-forge #### Install stable version -The actual status of stable is the following: - -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) -[![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) -[![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) -[![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) -[![HPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=release%2Fpytorch) - Install future release from the source ```bash pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/release/pytorch.zip -U ``` -#### Install bleeding-edge - future 1.7 +#### Install bleeding-edge Install nightly from the source (no guarantees) From 1773d3fb6b9c646ad68076e13e69b88858052943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sat, 8 Oct 2022 12:02:01 +0200 Subject: [PATCH 17/25] Install coverage. 
Unmark draft --- .github/workflows/ci-pytorch-test-tpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 032cb2ce24a00..44ea8cd20bea1 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -30,8 +30,7 @@ env: jobs: test-on-tpus: runs-on: ubuntu-22.04 - # FIXME: uncomment after finished - # if: github.event.pull_request.draft == false + if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` @@ -115,6 +114,7 @@ jobs: if: success() run: | mv ./xx01 coverage.xml + pip install coverage -q coverage report coverage xml From 1cd4ee2022326a34ff45b92660e66891c1eb867d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sat, 8 Oct 2022 12:02:48 +0200 Subject: [PATCH 18/25] Update .github/workflows/ci-pytorch-test-tpu.yml --- .github/workflows/ci-pytorch-test-tpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 44ea8cd20bea1..9c5fafdd5067e 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -7,7 +7,7 @@ on: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - # FIXME: use _target after merge to share secrets + # TODO: use _target after merge to share secrets pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped From 4011856e6ea076e45fe40b942c20ee63ed7433f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 11 Oct 2022 00:36:53 +0200 Subject: [PATCH 19/25] DEBUG echo --- .github/workflows/ci-pytorch-test-tpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 9c5fafdd5067e..957fd16aee554 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -100,6 +100,7 @@ jobs: done echo "Done waiting. Job status code: $status_code" kubectl logs -f $pod_name --container=train > /tmp/full_output.txt + echo /tmp/full_output.txt if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else From 981f2d8608fc986a5be5051d217810aedef19fbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 11 Oct 2022 02:40:19 +0200 Subject: [PATCH 20/25] Revert "DEBUG echo" This reverts commit 4011856e6ea076e45fe40b942c20ee63ed7433f3. --- .github/workflows/ci-pytorch-test-tpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 957fd16aee554..28d94efdcaf79 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -100,7 +100,7 @@ jobs: done echo "Done waiting. 
Job status code: $status_code" kubectl logs -f $pod_name --container=train > /tmp/full_output.txt - echo /tmp/full_output.txt + cat /tmp/full_output.txt if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else From 19dd22910745bda5ab6ce254f43bd1d11d43d6e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 14 Oct 2022 00:01:43 +0200 Subject: [PATCH 21/25] More debug --- .github/workflows/ci-pytorch-test-tpu.yml | 61 +++++++++++------------ 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 28d94efdcaf79..798a637054438 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -78,43 +78,40 @@ jobs: - name: Deploy cluster run: | - export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) - job_name=${job_name#job.batch/} - job_name=${job_name% created} - pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') - echo "GKE pod name: $pod_name" - echo "Waiting on kubernetes job: $job_name" - status_code=2 && - # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes. - printf "Waiting for job to finish: " - while true; do - if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then - status_code=1 && break; - elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then - status_code=0 && break; - else - printf "."; - fi; - sleep 5; - done - echo "Done waiting. 
Job status code: $status_code" - kubectl logs -f $pod_name --container=train > /tmp/full_output.txt - cat /tmp/full_output.txt - if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then - csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; - else - mv /tmp/full_output.txt xx00; - fi - # First portion is the test logs. - cat xx00 && echo "Done with log retrieval attempt." - exit $status_code + export PATH=$PATH:$HOME/go/bin + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) + job_name=${job_name#job.batch/} + job_name=${job_name% created} + pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') + echo "GKE pod name: $pod_name" + echo "Waiting on kubernetes job: $job_name" + status_code=2 && + # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes. + printf "Waiting for job to finish: " + while true; do + if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then + status_code=1 && break; + elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then + status_code=0 && break; + else + printf "."; + fi; + sleep 5; + done + echo "Done waiting. 
Job status code: $status_code" + kubectl logs -f $pod_name --container=train > /tmp/full_output.txt + grep '<?xml version="1.0" ?>' /tmp/full_output.txt # sanity check + csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/' + # REVERT ME + ls + cat xx01 + exit $status_code shell: bash - name: Statistics if: success() run: | - mv ./xx01 coverage.xml + mv xx01 coverage.xml pip install coverage -q coverage report coverage xml From 1685ba583bcd0f2174993bc844ef72608af799c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 19 Oct 2022 15:53:20 +0200 Subject: [PATCH 22/25] SSH --- .github/workflows/ci-pytorch-test-tpu.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index 798a637054438..ae6471af936bf 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -33,7 +33,7 @@ jobs: if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 - timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` + #timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` steps: - uses: actions/checkout@v3 @@ -108,6 +108,9 @@ jobs: exit $status_code shell: bash + - name: Setup upterm session + uses: lhotari/action-upterm@v1 + - name: Statistics if: success() run: | From 5f4384cf6b7caa4e5e4057f2ba4a6fd396a65471 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 21 Oct 2022 01:38:28 +0200 Subject: [PATCH 23/25] Im stupid --- .github/workflows/ci-pytorch-test-tpu.yml | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-tpu.yml b/.github/workflows/ci-pytorch-test-tpu.yml index ae6471af936bf..63eb2219fad6d 100644 --- a/.github/workflows/ci-pytorch-test-tpu.yml +++ b/.github/workflows/ci-pytorch-test-tpu.yml @@ -33,7 +33,7 @@ jobs: if: github.event.pull_request.draft == false 
env: PYTHON_VER: 3.7 - #timeout-minutes: 100 # 
should match the timeout in `tpu_test_cases.jsonnet` + timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` steps: - uses: actions/checkout@v3 @@ -102,23 +102,10 @@ jobs: kubectl logs -f $pod_name --container=train > /tmp/full_output.txt grep '<?xml version="1.0" ?>' /tmp/full_output.txt # sanity check csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/' - # REVERT ME - ls - cat xx01 + mv xx01 coverage.xml exit $status_code shell: bash - - name: Setup upterm session - uses: lhotari/action-upterm@v1 - - - name: Statistics - if: success() - run: | - mv xx01 coverage.xml - pip install coverage -q - coverage report - coverage xml - - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 if: always() From 714648002bd5acfdd7ddc0241d701f3fbc13c67d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 21 Oct 2022 01:42:44 +0200 Subject: [PATCH 24/25] Remove always() --- .github/workflows/tpu-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index a5f4657a8e565..41092eed8c401 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -109,7 +109,6 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: always() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From 7b66c1abbbdbac90b7847b8b0c8525f27b07bc52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 21 Oct 2022 02:23:09 +0200 Subject: [PATCH 25/25] Forgot some --- .github/workflows/README.md | 2 +- README.md | 2 +- src/pytorch_lightning/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 86fad9e9e7e83..e039559bdd71c 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -12,7 +12,7 @@ | pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only 
HPU-specific tests. 
| HPU | | pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests-pytorch.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | | PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | -| test-on-tpus | .github/workflows/ci-pytorch-test-tpu.yml | Run only TPU-specific tests. | TPU | +| test-on-tpus | .github/workflows/tpu-tests.yml | Run only TPU-specific tests. | TPU | - \*Accelerators used in CI diff --git a/README.md b/README.md index b64d3d6247f73..0613f102b7dde 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs | System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | | :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml) | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml) | - | - | | Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | | Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | | Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index 245d6a96ec21f..93e0e68866de8 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -81,7 +81,7 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs | System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | | :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-tpu.yml) | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/tpu-tests.yml) | - | - | | Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | | Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | | Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |