
Commit c4784a3

Merge branch 'master' into codeq/mlflow-logger
2 parents 08415ee + ca1917e commit c4784a3

17 files changed, +251 −134 lines

.azure/hpu-tests.yml

Lines changed: 24 additions & 3 deletions
@@ -20,23 +20,44 @@ jobs:
     timeoutInMinutes: "10"
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
-    pool: intel-hpus
+    pool: habana-gaudi-hpus
+    container:
+      image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest"
+      options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro"
     workspace:
       clean: all

     steps:
+    - script: |
+        /tmp/docker exec -t -u 0 cd-container \
+        sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
+      displayName: 'Install Sudo in container (thanks Microsoft!)'
+
     - bash: |
-        apt-get install -y hwinfo
+        sudo apt-get install -y hwinfo
         hwinfo --short
+        python --version
+        sudo pip install pip -U
       displayName: 'Instance HW info'

     - bash: |
-        pip install -e .[extra] -r requirements/pytorch/test.txt
+        set -e
+        pip --version
+        sudo pip uninstall -y lightning pytorch-lightning
+        pip install fire
+        python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
+        pip install ".[extra,test]"
+        pip list
       env:
         PACKAGE_NAME: pytorch
         FREEZE_REQUIREMENTS: 1
       displayName: 'Install dependencies'

+    - bash: |
+        hl-smi -L
+        lsmod | grep habanalabs
+      displayName: 'Check the driver status'
+
     - bash: |
         python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml
       workingDirectory: tests/tests_pytorch
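Side note on the 'Install dependencies' step above: pruning torch, torchvision and torchtext from the requirement files before `pip install ".[extra,test]"` presumably keeps pip from replacing the Habana-provided PyTorch build already baked into the container. The real logic lives in `.actions/assistant.py`; the Python sketch below is only an illustration of the idea, with a hypothetical helper name:

# Illustration only: a simplified, hypothetical version of a "prune packages from requirements"
# step; the real implementation is in .actions/assistant.py and may differ.
import re
from pathlib import Path

def prune_packages(req_file: Path, packages: list) -> None:
    """Drop requirement lines that pin the given packages (e.g. torch),
    so pip does not overwrite the torch build shipped with the HPU container."""
    pattern = re.compile(rf"^({'|'.join(map(re.escape, packages))})([=<>~!; ].*)?$")
    kept = [ln for ln in req_file.read_text().splitlines() if not pattern.match(ln.strip())]
    req_file.write_text("\n".join(kept) + "\n")

# Hypothetical usage mirroring the CI call:
# prune_packages(Path("requirements/pytorch/base.txt"), ["torch", "torchvision", "torchtext"])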

.github/workflows/ci_pr-gatekeeper.yml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ jobs:
         run: |
           patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}')
           changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns)
-          print('::set-output name=files::' + int(changed))
+          print(f'::set-output name=files::{int(changed)}')
         shell: python
       - uses: octodemo/pr-gatekeeper@main
         if: steps.touched.outputs.files == 1
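For context on the one-line change above: the old statement would have raised a TypeError at runtime, since Python refuses to concatenate a str with an int. A minimal illustration (the variable value is made up):

changed = True
# print('::set-output name=files::' + int(changed))  # TypeError: can only concatenate str (not "int") to str
print(f'::set-output name=files::{int(changed)}')    # prints: ::set-output name=files::1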

.github/workflows/cicd-pytorch_dockers.yml

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ jobs:
           build-args: |
             DIST=latest
             GAUDI_VERSION=${{ matrix.gaudi_version }}
-            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }}
           file: dockers/ci-runner-hpu/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ repos:
         name: Format imports

   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 22.6.0
     hooks:
       - id: black
         name: Format code

dockers/ci-runner-hpu/Dockerfile

Lines changed: 52 additions & 11 deletions
@@ -1,24 +1,65 @@
+# Run command to build:
+# gaudi_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/" | sed -n 's/.*href="\([^"]*\).*/\1/p' | tail -2 | head -1 | sed "s/\///1")
+# pytorch_install_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/$gaudi_ver/ubuntu20.04/habanalabs/" | sed -n 's/.*href="\([^"]*\).*/\1/p'| sed "s/\///1" | grep pytorch-installer)
+# pytorch_install_ver=${pytorch_install_ver/"pytorch-installer-"/""}
+# docker build -t gaudi-docker-agent:latest \
+#   --build-arg GAUDI_VERSION=$gaudi_ver \
+#   --build-arg PYTORCH_INSTALLER_VERSION=$pytorch_install_ver \
+#   -f Dockerfile .
+# Run command:
+# docker run --privileged \
+#   -v /dev:/dev \
+#   -e AZP_URL="https://dev.azure.com/ORGANIZATION/" \
+#   -e AZP_TOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" \
+#   -e AZP_AGENT_NAME="hpu1" \
+#   -e AZP_POOL="intel-hpus" \
+#   gaudi-docker-agent:latest
+
 ARG DIST="latest"
 ARG GAUDI_VERSION="1.5.0"
-ARG PYTORCH_VERSION="1.11.0"
-
-FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST}
+ARG PYTORCH_INSTALLER_VERSION="1.11.0"
+FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST}

 LABEL maintainer="https://vault.habana.ai/"
+# update the base packages and add a non-sudo user
+RUN \
+    apt-get update -y && \
+    apt-get upgrade -y && \
+    useradd -m docker

-RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
-
-WORKDIR /azp
-
-COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
+# To make it easier for build and release pipelines to run apt-get,
+# configure apt to not require confirmation (assume the -y argument by default)
+ENV DEBIAN_FRONTEND=noninteractive
+RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes

-RUN chmod +x /usr/local/bin/start.sh
+RUN apt-get update --fix-missing && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        build-essential \
+        curl \
+        jq \
+        git \
+        iputils-ping \
+        libcurl4 \
+        libunwind8 \
+        netcat \
+        libssl1.0 \
+        libssl-dev \
+        libffi-dev \
+        python3 \
+        python3-venv \
+        python3-dev \
+        python3-pip

 RUN curl -fsSL https://get.docker.com -o get-docker.sh && \
     sh get-docker.sh && \
     rm get-docker.sh

-#RUN docker --help
+RUN pip uninstall pytorch-lightning -y
+
+WORKDIR /azp
+
+COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/start.sh

 ENTRYPOINT ["/usr/local/bin/start.sh"]
-CMD ["bash"]

dockers/ci-runner-hpu/start.sh

Lines changed: 1 addition & 1 deletion
@@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM

 # To be aware of TERM and INT signals call run.sh
 # Running it with the --once flag at the end will shut down the agent after the build is executed
-./run.sh --once & wait $!
+./run.sh & wait $!

requirements/pytorch/base.txt

Lines changed: 1 addition & 1 deletion
@@ -7,4 +7,4 @@ tensorboard>=2.9.1, <2.10.0
 torchmetrics>=0.7.0, <0.9.2  # needed for using fixed compare_version
 pyDeprecate>=0.3.1, <=0.3.2
 packaging>=17.0, <=21.3
-typing-extensions>=4.0.0, <4.2.1
+typing-extensions>=4.0.0, <4.3.1

src/lightning/__setup__.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ def _adjust_manifest(**kwargs: Any) -> None:
     lines += [
         "recursive-include src *.md" + os.linesep,
         "recursive-include requirements *.txt" + os.linesep,
+        "recursive-include src/lightning_app/cli/*-template *" + os.linesep,  # Add templates
     ]
     with open(manifest_path, "w") as fp:
         fp.writelines(lines)

src/lightning_app/__setup__.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ def _adjust_manifest(**__: Any) -> None:
         "recursive-exclude requirements *.txt" + os.linesep,
         "recursive-include src/lightning_app *.md" + os.linesep,
         "recursive-include requirements/app *.txt" + os.linesep,
+        "recursive-include src/lightning_app/cli/*-template *" + os.linesep,  # Add templates
     ]

     # TODO: remove this once lightning-ui package is ready as a dependency

src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py

Lines changed: 25 additions & 1 deletion
@@ -11,10 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any
+from typing import Any, Optional
+
+import torch

 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin
+from pytorch_lightning.utilities.enums import PrecisionType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
+
+if _TORCH_GREATER_EQUAL_1_12:
+    from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
+else:
+    MixedPrecision = None


 class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin):
@@ -29,3 +38,18 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None:
         raise MisconfigurationException(
             f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`"
         )
+
+    @property
+    def mixed_precision_config(self) -> Optional[MixedPrecision]:
+        assert MixedPrecision is not None
+        if self.precision == PrecisionType.HALF:
+            dtype = torch.float16
+        elif self.precision == PrecisionType.BFLOAT:
+            dtype = torch.bfloat16
+        else:
+            raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.")
+        return MixedPrecision(
+            param_dtype=dtype,
+            reduce_dtype=dtype,
+            buffer_dtype=dtype,
+        )
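To show how the new `mixed_precision_config` property is typically consumed, here is a minimal sketch of native FSDP usage, assuming torch>=1.12 and an already-initialized process group; the module and dtype choice are illustrative and not part of this commit:

import torch
from torch import nn
from torch.distributed.fsdp import FullyShardedDataParallel, MixedPrecision

# The plugin's property builds a config like this one (fp16 shown; bf16 is analogous).
mp_config = MixedPrecision(
    param_dtype=torch.float16,   # parameters kept/communicated in fp16
    reduce_dtype=torch.float16,  # gradient reduction runs in fp16
    buffer_dtype=torch.float16,  # buffers (e.g. norm statistics) cast to fp16
)

# Within a distributed process group, the wrapped model would then be:
# model = FullyShardedDataParallel(nn.Linear(32, 32), mixed_precision=mp_config)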
