Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/ci_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
- ".github/workflows/events-nightly.yml"
- "setup.py"

# Concurrency settings: runs are grouped by workflow name, ref and head ref.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
# Evaluates to false (do not cancel) when the ref is master or a release/ branch, true otherwise.
cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
# FIXME: the concurrency settings below are temporarily commented out; re-enable them.
# concurrency:
# group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
# cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}

jobs:
build-PL:
Expand All @@ -43,7 +44,7 @@ jobs:
PYTORCH_VERSION=${{ matrix.pytorch_version }}
file: dockers/release/Dockerfile
push: false
timeout-minutes: 50
timeout-minutes: 70  # FIXME: temporarily raised from 50

build-XLA:
runs-on: ubuntu-20.04
Expand Down
35 changes: 7 additions & 28 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -113,38 +113,17 @@ RUN \
export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
cat ./requirements/horovod.txt && \
cmake --version && \
pip install --no-cache-dir -r ./requirements/horovod.txt && \
pip install --no-cache-dir "horovod==0.24.1" && \
PYTHONUNBUFFERED=1 horovodrun --check-build && \
# pip uninstall -y horovod && \
# pip install --no-cache-dir -r ./requirements/horovod.txt && \
# PYTHONUNBUFFERED=1 horovodrun --check-build && \
python -c "from horovod.torch import nccl_built; nccl_built()" && \
rm -rf requirements/

RUN \
    # Major CUDA version reported by the installed torch build; used to pick the DALI wheel name.
    CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \
    # py_ver is 1 when PYTHON_VERSION >= 3.9 and 0 otherwise. The major/minor components are
    # compared as integers: comparing the split strings would rank "3.10" below "3.9".
    py_ver=$(python -c "print(int(tuple(map(int, '$PYTHON_VERSION'.split('.')[:2])) >= (3, 9)))") && \
    # install DALI, needed for examples (only for Python older than 3.9)
    # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691
    if [ "$py_ver" -eq 0 ]; then \
        pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" && \
        # smoke test: fail the build if the DALI pipeline module cannot be imported
        python -c 'from nvidia.dali.pipeline import Pipeline' ; \
    fi

# NVIDIA apex, installed from the master-branch archive with the C++ and CUDA extension options.
RUN \
    pip install \
        -v \
        --disable-pip-version-check \
        --no-cache-dir \
        --global-option="--cpp_ext" \
        --global-option="--cuda_ext" \
        https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
    # smoke test: the build fails here if apex.amp is not importable
    python -c "from apex import amp"

RUN \
    # install FairScale (pinned version); --no-cache-dir keeps pip's download cache out of the
    # image layer, matching the other pip installs in this Dockerfile
    pip install --no-cache-dir fairscale==0.4.0 && \
    # smoke test: import the package and print the installed version
    python -c "import fairscale; print(fairscale.__version__)"

RUN \
    # install DeepSpeed (pinned version); --no-cache-dir keeps pip's download cache out of the
    # image layer, matching the other pip installs in this Dockerfile
    pip install --no-cache-dir deepspeed==0.5.7 && \
    # smoke test: import the package and print the installed version
    python -c "import deepspeed; print(deepspeed.__version__)"

RUN \
# Show what we have: print the pip version and the installed packages, then assert that the
# interpreter's major.minor equals $PYTHON_VERSION and that torch's version starts with $PYTORCH_VERSION.
# NOTE(review): the torch version assertion appears twice and the `import horovod.torch` line has
# no `&& \` continuation - this looks like both sides of a diff hunk; confirm which ending is intended.
pip --version && \
pip list && \
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
python -c "import horovod.torch"
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__"