Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
90433fc
pip list
Borda Jun 27, 2022
681c0e7
ver
Borda Jun 28, 2022
3d625c9
ver
Borda Jun 28, 2022
a504900
ver
Borda Jun 28, 2022
e2ea4e4
prune
Borda Jun 28, 2022
597f049
user
Borda Jun 28, 2022
65f16a7
find
Borda Jun 30, 2022
48fa044
ls
Borda Jun 30, 2022
fcb6021
pypi
Borda Jun 30, 2022
ac58035
.
Borda Jun 30, 2022
f6e74c1
force
Borda Jun 30, 2022
2d88140
uninstall
Borda Jun 30, 2022
76f6cfb
Update the hpu-tests.yml to pull docker from vault
raoakarsha Jun 24, 2022
8ff238c
runner
Borda Jun 24, 2022
8a9859f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 24, 2022
a7968ab
prune
Borda Jul 11, 2022
c053c03
sudo
Borda Jul 11, 2022
434bc8a
sudo
Borda Jul 11, 2022
a48920e
sudo
Borda Jul 11, 2022
9b6205c
Merge branch 'ci/hpu2' of https://github.com/PyTorchLightning/pytorch…
Borda Jul 11, 2022
c528248
fire & sudo
Borda Jul 11, 2022
f0d719b
habana-gaudi-hpus
Borda Jul 12, 2022
19c6b68
Merge branch 'master' into ci/hpu2
Borda Jul 12, 2022
4fd076f
collision
Borda Jul 12, 2022
dfa5720
Merge branch 'master' into ci/hpu2
Borda Jul 15, 2022
318ca08
Merge branch 'master' into ci/hpu2
Borda Jul 15, 2022
6c731e0
ci
Borda Jul 15, 2022
8d60792
Merge branch 'master' into ci/hpu2
Borda Jul 15, 2022
b13f6bb
Merge branch 'master' into ci/hpu2
Borda Jul 19, 2022
3630c59
params
Borda Jul 19, 2022
7585207
Merge branch 'master' into ci/hpu2
Borda Jul 19, 2022
3e5da23
Check the driver status on gaudi server (#13718)
raoakarsha Jul 19, 2022
89a408e
gk
Borda Jul 20, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions .azure/hpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,44 @@ jobs:
timeoutInMinutes: "10"
# how much time to give tasks that are set to always run (even when the job is cancelled) before stopping them
cancelTimeoutInMinutes: "2"
pool: intel-hpus
pool: habana-gaudi-hpus
container:
image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest"
options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro"
workspace:
clean: all

steps:
- script: |
/tmp/docker exec -t -u 0 cd-container \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
displayName: 'Install Sudo in container (thanks Microsoft!)'

- bash: |
apt-get install -y hwinfo
sudo apt-get install -y hwinfo
hwinfo --short
python --version
sudo pip install pip -U
displayName: 'Instance HW info'

- bash: |
pip install -e .[extra] -r requirements/pytorch/test.txt
set -e
pip --version
sudo pip uninstall -y lightning pytorch-lightning
pip install fire
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
pip install ".[extra,test]"
pip list
env:
PACKAGE_NAME: pytorch
FREEZE_REQUIREMENTS: 1
displayName: 'Install dependencies'

- bash: |
hl-smi -L
lsmod | grep habanalabs
displayName: 'Check the driver status'

- bash: |
python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml
workingDirectory: tests/tests_pytorch
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_pr-gatekeeper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: |
patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}')
changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns)
print('::set-output name=files::' + int(changed))
print(f'::set-output name=files::{int(changed)}')
shell: python
- uses: octodemo/pr-gatekeeper@main
if: steps.touched.outputs.files == 1
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cicd-pytorch_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ jobs:
build-args: |
DIST=latest
GAUDI_VERSION=${{ matrix.gaudi_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }}
file: dockers/ci-runner-hpu/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
Expand Down
63 changes: 52 additions & 11 deletions dockers/ci-runner-hpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,65 @@
# Command to build the image:
# gaudi_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/" | sed -n 's/.*href="\([^"]*\).*/\1/p' | tail -2 | head -1 | sed "s/\///1")
# pytorch_install_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/$gaudi_ver/ubuntu20.04/habanalabs/" | sed -n 's/.*href="\([^"]*\).*/\1/p'| sed "s/\///1" | grep pytorch-installer)
# pytorch_install_ver=${pytorch_install_ver/"pytorch-installer-"/""}
# docker build -t gaudi-docker-agent:latest \
# --build-arg GAUDI_VERSION=$gaudi_ver \
# --build-arg PYTORCH_INSTALLER_VERSION=$pytorch_install_ver \
# -f Dockerfile .
# Command to run the container:
# docker run --privileged \
# -v /dev:/dev \
# -e AZP_URL="https://dev.azure.com/ORGANIZATION/" \
# -e AZP_TOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" \
# -e AZP_AGENT_NAME="hpu1" \
# -e AZP_POOL="intel-hpus" \
# gaudi-docker-agent:latest

ARG DIST="latest"
ARG GAUDI_VERSION="1.5.0"
ARG PYTORCH_VERSION="1.11.0"

FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST}
ARG PYTORCH_INSTALLER_VERSION="1.11.0"
FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST}

LABEL maintainer="https://vault.habana.ai/"
# update the base packages and add a non-sudo user
RUN \
apt-get update -y && \
apt-get upgrade -y && \
useradd -m docker

RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

WORKDIR /azp

COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
# To make it easier for build and release pipelines to run apt-get,
# configure apt to not require confirmation (assume the -y argument by default)
ENV DEBIAN_FRONTEND=noninteractive
RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes

RUN chmod +x /usr/local/bin/start.sh
RUN apt-get update --fix-missing && \
apt-get install -y --no-install-recommends \
ca-certificates \
build-essential \
curl \
jq \
git \
iputils-ping \
libcurl4 \
libunwind8 \
netcat \
libssl1.0 \
libssl-dev \
libffi-dev \
python3 \
python3-venv \
python3-dev \
python3-pip

RUN curl -fsSL https://get.docker.com -o get-docker.sh && \
sh get-docker.sh && \
rm get-docker.sh

#RUN docker --help
RUN pip uninstall pytorch-lightning -y

WORKDIR /azp

COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/start.sh

ENTRYPOINT ["/usr/local/bin/start.sh"]
CMD ["bash"]
2 changes: 1 addition & 1 deletion dockers/ci-runner-hpu/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM

# Start run.sh in the background and wait on it so that this script stays aware of TERM and INT signals
# Running it with the --once flag at the end will shut down the agent after the build is executed
./run.sh --once & wait $!
./run.sh & wait $!