diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index e530ace901bfa..f0b279bda3f60 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -20,23 +20,44 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: intel-hpus + pool: habana-gaudi-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all steps: + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | - apt-get install -y hwinfo + sudo apt-get install -y hwinfo hwinfo --short + python --version + sudo pip install pip -U displayName: 'Instance HW info' - bash: | - pip install -e .[extra] -r requirements/pytorch/test.txt + set -e + pip --version + sudo pip uninstall -y lightning pytorch-lightning + pip install fire + python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + pip install ".[extra,test]" + pip list env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' + - bash: | + hl-smi -L + lsmod | grep habanalabs + displayName: 'Check the driver status' + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci_pr-gatekeeper.yml index d76801fd529a5..92215edd3c107 100644 --- a/.github/workflows/ci_pr-gatekeeper.yml +++ b/.github/workflows/ci_pr-gatekeeper.yml @@ -26,7 +26,7 @@ jobs: run: | patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}') changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns) - print('::set-output name=files::' + int(changed)) + print(f'::set-output name=files::{int(changed)}') shell: python - uses: octodemo/pr-gatekeeper@main if: steps.touched.outputs.files == 1 diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index 3d7bb6fc363e9..4742f3579c274 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -225,7 +225,7 @@ jobs: build-args: | DIST=latest GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} file: dockers/ci-runner-hpu/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index c4e37a5e2b41b..588d23702e9ff 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -1,24 +1,65 @@ +# Run command to build: +# gaudi_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/" | sed -n 's/.*href="\([^"]*\).*/\1/p' | tail -2 | head -1 | sed "s/\///1") +# pytorch_install_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/$gaudi_ver/ubuntu20.04/habanalabs/" | sed -n 's/.*href="\([^"]*\).*/\1/p'| sed "s/\///1" | grep pytorch-installer) +# pytorch_install_ver=${pytorch_install_ver/"pytorch-installer-"/""} +# docker build -t gaudi-docker-agent:latest \ +# --build-arg GAUDI_VERSION=$gaudi_ver \ +# --build-arg PYTORCH_INSTALLER_VERSION=$pytorch_install_ver \ +# -f Dockerfile . +# Run command: +# docker run --privileged \ +# -v /dev:/dev \ +# -e AZP_URL="https://dev.azure.com/ORGANIZATION/" \ +# -e AZP_TOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" \ +# -e AZP_AGENT_NAME="hpu1" \ +# -e AZP_POOL="intel-hpus" \ +# gaudi-docker-agent:latest + ARG DIST="latest" ARG GAUDI_VERSION="1.5.0" -ARG PYTORCH_VERSION="1.11.0" - -FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST} +ARG PYTORCH_INSTALLER_VERSION="1.11.0" +FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST} LABEL maintainer="https://vault.habana.ai/" +# update the base packages and add a non-sudo user +RUN \ + apt-get update -y && \ + apt-get upgrade -y && \ + useradd -m docker -RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - -WORKDIR /azp - -COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +# To make it easier for build and release pipelines to run apt-get, +# configure apt to not require confirmation (assume the -y argument by default) +ENV DEBIAN_FRONTEND=noninteractive +RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes -RUN chmod +x /usr/local/bin/start.sh +RUN apt-get update --fix-missing && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential \ + curl \ + jq \ + git \ + iputils-ping \ + libcurl4 \ + libunwind8 \ + netcat \ + libssl1.0 \ + libssl-dev \ + libffi-dev \ + python3 \ + python3-venv \ + python3-dev \ + python3-pip RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ sh get-docker.sh && \ rm get-docker.sh -#RUN docker --help +RUN pip uninstall pytorch-lightning -y + +WORKDIR /azp + +COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/start.sh ENTRYPOINT ["/usr/local/bin/start.sh"] -CMD ["bash"] diff --git a/dockers/ci-runner-hpu/start.sh b/dockers/ci-runner-hpu/start.sh index caa452b978c18..82472a817ab94 100644 --- a/dockers/ci-runner-hpu/start.sh +++ b/dockers/ci-runner-hpu/start.sh @@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM # To be aware of TERM and INT signals call run.sh # Running it with the --once flag at the end will shut down the agent after the build is executed -./run.sh --once & wait $! +./run.sh & wait $!