From 90433fc435e86f290a171b267be4385947db9ebb Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jun 2022 18:22:00 +0200 Subject: [PATCH 01/26] pip list --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index a3041ce32daae..d841e6f4286fa 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -31,6 +31,7 @@ jobs: - bash: | pip install -e .[extra] -r requirements/pytorch/test.txt + pip list env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 From 681c0e7e15423629a6c5ca9e1f566d3d6fb34944 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 28 Jun 2022 23:56:22 +0200 Subject: [PATCH 02/26] ver --- .azure/hpu-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index d841e6f4286fa..7455530b28f06 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -27,10 +27,12 @@ jobs: - bash: | apt-get install -y hwinfo hwinfo --short + python --version displayName: 'Instance HW info' - bash: | - pip install -e .[extra] -r requirements/pytorch/test.txt + pip --version + pip install -e ".[extra]" -r requirements/pytorch/test.txt pip list env: PACKAGE_NAME: pytorch From 3d625c90293294aaf795d9751fdf2ed39732efa0 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 29 Jun 2022 00:01:51 +0200 Subject: [PATCH 03/26] ver --- .azure/hpu-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7455530b28f06..2554b848d3d4d 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -28,11 +28,12 @@ jobs: apt-get install -y hwinfo hwinfo --short python --version + pip install pip -U displayName: 'Instance HW info' - bash: | pip --version - pip install -e ".[extra]" -r requirements/pytorch/test.txt + pip install -e ".[extra]" -u -r requirements/pytorch/test.txt pip list env: PACKAGE_NAME: pytorch From a504900d75b76852ba4c3b6b93691435eb7da46b Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 29 Jun 2022 00:03:52 +0200 Subject: [PATCH 04/26] ver --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 2554b848d3d4d..73a756b33324d 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -33,7 +33,7 @@ jobs: - bash: | pip --version - pip install -e ".[extra]" -u -r requirements/pytorch/test.txt + pip install -e ".[extra,test]" pip list env: PACKAGE_NAME: pytorch From e2ea4e4e84005ba9dbffcce338657237842b03ea Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 29 Jun 2022 00:08:58 +0200 Subject: [PATCH 05/26] prune --- .azure/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 73a756b33324d..4327fb768c40d 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -33,6 +33,7 @@ jobs: - bash: | pip --version + .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext pip install -e ".[extra,test]" pip list env: From 597f049f413919b975fa92750b98e2cf35c0b0fa Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 29 Jun 2022 00:15:13 +0200 Subject: [PATCH 06/26] user --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 4327fb768c40d..393be77635efb 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -34,7 +34,7 @@ jobs: - bash: | pip --version .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext - pip install -e ".[extra,test]" + pip install -e ".[extra,test]" --user pip list env: PACKAGE_NAME: pytorch From 65f16a7da16b6c06cb3050184cac08d642f6c9f6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 17:56:35 +0200 Subject: [PATCH 07/26] find --- .azure/hpu-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 393be77635efb..234389cedc5d9 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -36,6 +36,8 @@ jobs: .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext pip install -e ".[extra,test]" --user pip list + find / -name "dist-packages" + find / -name "site-packages" env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 From 48fa044fa530e2eb881b3d53fc107095e94fddb2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 18:00:54 +0200 Subject: [PATCH 08/26] ls --- .azure/hpu-tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 234389cedc5d9..963070a9148d4 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -38,6 +38,11 @@ jobs: pip list find / -name "dist-packages" find / -name "site-packages" + ls -lh /usr/local/lib/python3.8/dist-packages | grep lightning + ls -lh /usr/lib/python3/dist-packages | grep lightning + ls -lh /usr/lib/python2.7/dist-packages | grep lightning + ls -lh /usr/lib/python3.8/site-packages | grep lightning + ls -lh /root/.local/lib/python3.8/site-packages | grep lightning env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 From fcb6021af9b09ef9fcfae512adcfaea252296f63 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 18:09:22 +0200 Subject: [PATCH 09/26] pypi --- .azure/hpu-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 963070a9148d4..ffeb872b03a5a 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -43,6 +43,9 @@ jobs: ls -lh /usr/lib/python2.7/dist-packages | grep lightning ls -lh /usr/lib/python3.8/site-packages | grep lightning ls -lh /root/.local/lib/python3.8/site-packages | grep lightning + # fixme + pip install pytorch-lightning + pip list | grep lightning env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 From ac580353cb480caf27cdd04c8be8b06490f7f104 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 18:48:19 +0200 Subject: [PATCH 10/26] . --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index ffeb872b03a5a..57ab52d5b49fd 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -34,7 +34,7 @@ jobs: - bash: | pip --version .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext - pip install -e ".[extra,test]" --user + pip install .["extra","test"] --user pip list find / -name "dist-packages" find / -name "site-packages" From f6e74c103630b2fa9891303db27555d22d270836 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 19:12:02 +0200 Subject: [PATCH 11/26] force --- .azure/hpu-tests.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 57ab52d5b49fd..8b5464ecf95cb 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -34,18 +34,8 @@ jobs: - bash: | pip --version .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext - pip install .["extra","test"] --user + pip install .["extra","test"] --force-reinstall pip list - find / -name "dist-packages" - find / -name "site-packages" - ls -lh /usr/local/lib/python3.8/dist-packages | grep lightning - ls -lh /usr/lib/python3/dist-packages | grep lightning - ls -lh /usr/lib/python2.7/dist-packages | grep lightning - ls -lh /usr/lib/python3.8/site-packages | grep lightning - ls -lh /root/.local/lib/python3.8/site-packages | grep lightning - # fixme - pip install pytorch-lightning - pip list | grep lightning env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 From 2d8814089699daf605a8e5530f33c7baf05bb189 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 30 Jun 2022 19:15:22 +0200 Subject: [PATCH 12/26] uninstall --- .azure/hpu-tests.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 8b5464ecf95cb..60f244d6c818d 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -33,8 +33,9 @@ jobs: - bash: | pip --version - .actions/assistant.py requirements_prune_pkgs torch,torchvision,torchtext - pip install .["extra","test"] --force-reinstall + pip uninstall -y pytorch-lightning + .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + pip install .["extra","test"] pip list env: PACKAGE_NAME: pytorch From 76f6cfbcd3bde7b7adc73f851249e823a2545494 Mon Sep 17 00:00:00 2001 From: arao Date: Fri, 24 Jun 2022 06:00:36 +0300 Subject: [PATCH 13/26] Update the hpu-tests.yml to pull docker from vault --- .azure/hpu-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 60f244d6c818d..a1695adcf8bd7 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -20,6 +20,8 @@ jobs: # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" pool: intel-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" workspace: clean: all From 8ff238c47918690405b99ddfeda2826341fbbad9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 24 Jun 2022 15:17:25 +0200 Subject: [PATCH 14/26] runner --- .github/workflows/ci-pytorch_dockers.yml | 2 +- .github/workflows/events-nightly.yml | 4 +- dockers/ci-runner-hpu/Dockerfile | 59 ++++++++++++++++++------ dockers/ci-runner-hpu/start.sh | 2 +- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci-pytorch_dockers.yml b/.github/workflows/ci-pytorch_dockers.yml index 69d5955c5db33..366f0ea45c86d 100644 --- a/.github/workflows/ci-pytorch_dockers.yml +++ b/.github/workflows/ci-pytorch_dockers.yml @@ -177,7 +177,7 @@ jobs: build-args: | DIST=latest GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} file: dockers/ci-runner-hpu/Dockerfile push: false timeout-minutes: 60 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 0325671413dbb..afbd18c778663 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -297,10 +297,10 @@ jobs: build-args: | DIST=latest GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} file: dockers/ci-runner-hpu/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} + tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi timeout-minutes: 55 # report failure to Slack diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index c4e37a5e2b41b..444fb0e4e01e3 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -1,24 +1,57 @@ +# Run command to build: +# gaudi_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/" | sed -n 's/.*href="\([^"]*\).*/\1/p' | tail -2 | head -1 | sed "s/\///1") +# pytorch_install_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/$gaudi_ver/ubuntu20.04/habanalabs/" | sed -n 's/.*href="\([^"]*\).*/\1/p'| sed "s/\///1" | grep pytorch-installer) +# pytorch_install_ver=${pytorch_install_ver/"pytorch-installer-"/""} +# docker build -t gaudi-docker-agent:latest \ +# --build-arg GAUDI_VERSION=$gaudi_ver \ +# --build-arg PYTORCH_INSTALLER_VERSION=$pytorch_install_ver \ +# -f Dockerfile . +# Run command: +# docker run --privileged \ +# -v /dev:/dev \ +# -e AZP_URL="https://dev.azure.com/ORGANIZATION/" \ +# -e AZP_TOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" \ +# -e AZP_AGENT_NAME="hpu1" \ +# -e AZP_POOL="intel-hpus" \ +# gaudi-docker-agent:latest + ARG DIST="latest" ARG GAUDI_VERSION="1.5.0" -ARG PYTORCH_VERSION="1.11.0" - -FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST} +ARG PYTORCH_INSTALLER_VERSION="1.11.0" +FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST} LABEL maintainer="https://vault.habana.ai/" -RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - -WORKDIR /azp - -COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ - -RUN chmod +x /usr/local/bin/start.sh +RUN pip uninstall pytorch-lightning -y +# update the base packages and add a non-sudo user +RUN apt-get update -y && apt-get upgrade -y && useradd -m docker +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + curl jq build-essential libssl-dev libffi-dev python3 python3-venv python3-dev python3-pip + +# To make it easier for build and release pipelines to run apt-get, +# configure apt to not require confirmation (assume the -y argument by default) +ENV DEBIAN_FRONTEND=noninteractive +RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes + +RUN apt-get update --fix-missing && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + jq \ + git \ + iputils-ping \ + libcurl4 \ + libunwind8 \ + netcat \ + libssl1.0 RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ sh get-docker.sh && \ rm get-docker.sh -#RUN docker --help +WORKDIR /azp + +COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/start.sh -ENTRYPOINT ["/usr/local/bin/start.sh"] -CMD ["bash"] +ENTRYPOINT ["/usr/local/bin/start.sh"] \ No newline at end of file diff --git a/dockers/ci-runner-hpu/start.sh b/dockers/ci-runner-hpu/start.sh index caa452b978c18..ac0405fd9cfde 100644 --- a/dockers/ci-runner-hpu/start.sh +++ b/dockers/ci-runner-hpu/start.sh @@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM # To be aware of TERM and INT signals call run.sh # Running it with the --once flag at the end will shut down the agent after the build is executed -./run.sh --once & wait $! +./run.sh & wait $! \ No newline at end of file From 8a9859f42b09c89156a169b55cfbc48907edc119 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 13:20:59 +0000 Subject: [PATCH 15/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dockers/ci-runner-hpu/Dockerfile | 2 +- dockers/ci-runner-hpu/start.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index 444fb0e4e01e3..82147b06f481b 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -54,4 +54,4 @@ WORKDIR /azp COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/start.sh -ENTRYPOINT ["/usr/local/bin/start.sh"] \ No newline at end of file +ENTRYPOINT ["/usr/local/bin/start.sh"] diff --git a/dockers/ci-runner-hpu/start.sh b/dockers/ci-runner-hpu/start.sh index ac0405fd9cfde..82472a817ab94 100644 --- a/dockers/ci-runner-hpu/start.sh +++ b/dockers/ci-runner-hpu/start.sh @@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM # To be aware of TERM and INT signals call run.sh # Running it with the --once flag at the end will shut down the agent after the build is executed -./run.sh & wait $! \ No newline at end of file +./run.sh & wait $! From a7968ab711bc62ff12120a561c77d608fe0bf8ac Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 00:07:40 +0200 Subject: [PATCH 16/26] prune --- dockers/ci-runner-hpu/Dockerfile | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index 82147b06f481b..588d23702e9ff 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -21,12 +21,11 @@ ARG PYTORCH_INSTALLER_VERSION="1.11.0" FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST} LABEL maintainer="https://vault.habana.ai/" - -RUN pip uninstall pytorch-lightning -y # update the base packages and add a non-sudo user -RUN apt-get update -y && apt-get upgrade -y && useradd -m docker -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - curl jq build-essential libssl-dev libffi-dev python3 python3-venv python3-dev python3-pip +RUN \ + apt-get update -y && \ + apt-get upgrade -y && \ + useradd -m docker # To make it easier for build and release pipelines to run apt-get, # configure apt to not require confirmation (assume the -y argument by default) @@ -36,6 +35,7 @@ RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes RUN apt-get update --fix-missing && \ apt-get install -y --no-install-recommends \ ca-certificates \ + build-essential \ curl \ jq \ git \ @@ -43,12 +43,20 @@ RUN apt-get update --fix-missing && \ libcurl4 \ libunwind8 \ netcat \ - libssl1.0 + libssl1.0 \ + libssl-dev \ + libffi-dev \ + python3 \ + python3-venv \ + python3-dev \ + python3-pip RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ sh get-docker.sh && \ rm get-docker.sh +RUN pip uninstall pytorch-lightning -y + WORKDIR /azp COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ From c053c0399adfaa7e0dda6018e8bc5eb941ddbb2a Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 00:52:04 +0200 Subject: [PATCH 17/26] sudo --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index a1695adcf8bd7..7b66fd827afe1 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -27,7 +27,7 @@ jobs: steps: - bash: | - apt-get install -y hwinfo + sudo apt-get install -y hwinfo hwinfo --short python --version pip install pip -U From 434bc8a88f9e79ea6f064c897f543f54cbe748c0 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 01:04:08 +0200 Subject: [PATCH 18/26] sudo --- .azure/hpu-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 7b66fd827afe1..278c1ec02f73c 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -22,10 +22,16 @@ jobs: pool: intel-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all steps: + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | sudo apt-get install -y hwinfo hwinfo --short From a48920e25f1a54ce446db4dca508ffc62f1338ee Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 01:15:56 +0200 Subject: [PATCH 19/26] sudo --- .azure/hpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 60f244d6c818d..698a5cdf4fcfa 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -28,12 +28,12 @@ jobs: apt-get install -y hwinfo hwinfo --short python --version - pip install pip -U + sudo pip install pip -U displayName: 'Instance HW info' - bash: | pip --version - pip uninstall -y pytorch-lightning + pip uninstall -y lightning pytorch-lightning .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext pip install .["extra","test"] pip list From c528248180477ad853e49dd62f482d3a793ade9a Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 01:34:04 +0200 Subject: [PATCH 20/26] fire & sudo --- .azure/hpu-tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index bb471db9a7676..766a8bf995320 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -40,9 +40,11 @@ jobs: displayName: 'Instance HW info' - bash: | + set -e pip --version - pip uninstall -y lightning pytorch-lightning - .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + sudo pip uninstall -y lightning pytorch-lightning + pip install fire + python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext pip install .["extra","test"] pip list env: From f0d719b8eed0751540b0538bb67851845d9b7ef3 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 12:22:37 +0200 Subject: [PATCH 21/26] habana-gaudi-hpus --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 766a8bf995320..338c4624b3711 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -19,7 +19,7 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: intel-hpus + pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" From 4fd076f651b54d4a8944334d8143054657433f34 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 12:35:28 +0200 Subject: [PATCH 22/26] collision --- .azure/hpu-tests.yml | 2 +- .github/workflows/cicd-pytorch_dockers.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 338c4624b3711..16384e8a62ef6 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -45,7 +45,7 @@ jobs: sudo pip uninstall -y lightning pytorch-lightning pip install fire python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext - pip install .["extra","test"] + pip install ".[extra,test]" pip list env: PACKAGE_NAME: pytorch diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index 317d005bd89f1..ca40357380400 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -226,7 +226,7 @@ jobs: build-args: | DIST=latest GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} file: dockers/ci-runner-hpu/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} From 6c731e00b8c74f235748741c3247c0de7b118efe Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 15 Jul 2022 23:28:25 +0200 Subject: [PATCH 23/26] ci --- .azure/hpu-tests.yml | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 70e7e11a551e5..8a0b529baa9b3 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -14,10 +14,6 @@ pr: - "master" - "release/*" -variables: - - name: continue - value: '1' - jobs: - job: testing # how long to run the job before automatically cancelling @@ -32,26 +28,11 @@ jobs: clean: all steps: - - bash: | - CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo $CHANGED_FILES > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "##vso[task.setvariable variable=continue]0" - else - echo "Continue" - echo "##vso[task.setvariable variable=continue]1" - fi - displayName: Skipper - script: | /tmp/docker exec -t -u 0 cd-container \ sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" displayName: 'Install Sudo in container (thanks Microsoft!)' - condition: eq(variables['continue'], '1') - bash: | sudo apt-get install -y hwinfo @@ -59,7 +40,6 @@ jobs: python --version sudo pip install pip -U displayName: 'Instance HW info' - condition: eq(variables['continue'], '1') - bash: | set -e @@ -73,19 +53,16 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' - condition: eq(variables['continue'], '1') - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Single card HPU test' - condition: eq(variables['continue'], '1') - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --hpus 8 --junitxml=hpu8_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'Multi card(8) HPU test' - condition: eq(variables['continue'], '1') - bash: | python -m pytest -sv plugins/precision/hpu/test_hpu.py --hmp-bf16 \ @@ -94,14 +71,12 @@ jobs: --junitxml=hpu1_precision_test-results.xml workingDirectory: tests/tests_pytorch displayName: 'HPU precision test' - condition: eq(variables['continue'], '1') - bash: | export PYTHONPATH="${PYTHONPATH}:$(pwd)" python "pl_hpu/mnist_sample.py" workingDirectory: examples displayName: 'Testing: HPU examples' - condition: eq(variables['continue'], '1') - task: PublishTestResults@2 inputs: From 3630c59f73d78744c4793f8a6cd6a017e484fbb2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 19 Jul 2022 22:44:54 +0200 Subject: [PATCH 24/26] params --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index cfe5a3b6f2d7b..0fa04920ec7cc 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -23,7 +23,7 @@ jobs: pool: habana-gaudi-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" - options: "--shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all From 3e5da238371307e2c4a842bb181a81aa15e42e9a Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 19 Jul 2022 13:50:13 -0700 Subject: [PATCH 25/26] Check the driver status on gaudi server (#13718) --- .azure/hpu-tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 0fa04920ec7cc..f0b279bda3f60 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -53,6 +53,11 @@ jobs: FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' + - bash: | + hl-smi -L + lsmod | grep habanalabs + displayName: 'Check the driver status' + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch From 89a408e5a0fd8c19a6e206db9d46cb7283b3c3c7 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 20 Jul 2022 05:27:39 +0200 Subject: [PATCH 26/26] gk --- .github/workflows/ci_pr-gatekeeper.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci_pr-gatekeeper.yml index d76801fd529a5..92215edd3c107 100644 --- a/.github/workflows/ci_pr-gatekeeper.yml +++ b/.github/workflows/ci_pr-gatekeeper.yml @@ -26,7 +26,7 @@ jobs: run: | patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}') changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns) - print('::set-output name=files::' + int(changed)) + print(f'::set-output name=files::{int(changed)}') shell: python - uses: octodemo/pr-gatekeeper@main if: steps.touched.outputs.files == 1