Patch summary (free text ahead of the first "diff --git" header).

  * .github/workflows/ci_dockers.yml
      - comments out the top-level `concurrency` block under a FIXME marker
      - raises the `timeout-minutes` that follows the release Dockerfile
        build step from 50 to 70 (also marked FIXME)
  * dockers/base-cuda/Dockerfile
      - installs the pinned "horovod==0.24.1" instead of
        ./requirements/horovod.txt, then runs `horovodrun --check-build`
        and calls `horovod.torch.nccl_built()`
      - removes the DALI, NVIDIA apex, FairScale and DeepSpeed install steps
      - removes the trailing `import horovod.torch` check

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. The indentation inside the hunks
below is reconstructed (2-space YAML nesting, 4-space Dockerfile
continuation lines) - confirm it against the target files before applying.

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 0a32aede3489c..f6a274510437b 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
       - ".github/workflows/events-nightly.yml"
       - "setup.py"
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
-  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
+# FIXME
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
+#   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 jobs:
   build-PL:
@@ -43,7 +44,7 @@ jobs:
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
           file: dockers/release/Dockerfile
           push: false
-        timeout-minutes: 50
+        timeout-minutes: 70  # FIXME
 
   build-XLA:
     runs-on: ubuntu-20.04
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 739ff591eb062..b820c0383b468 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -113,38 +113,17 @@ RUN \
     export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
     cat ./requirements/horovod.txt && \
     cmake --version && \
-    pip install --no-cache-dir -r ./requirements/horovod.txt && \
+    pip install --no-cache-dir "horovod==0.24.1" && \
+    PYTHONUNBUFFERED=1 horovodrun --check-build && \
+    # pip uninstall -y horovod && \
+    # pip install --no-cache-dir -r ./requirements/horovod.txt && \
+    # PYTHONUNBUFFERED=1 horovodrun --check-build && \
+    python -c "from horovod.torch import nccl_built; nccl_built()" && \
     rm -rf requirements/
 
-RUN \
-    CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \
-    py_ver=$(python -c "print(int('$PYTHON_VERSION'.split('.') >= '3.9'.split('.')))") && \
-    # install DALI, needed for examples
-    # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691
-    if [ $py_ver -eq "0" ]; then \
-        pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \
-        python -c 'from nvidia.dali.pipeline import Pipeline' ; \
-    fi
-
-RUN \
-    # install NVIDIA apex
-    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
-    python -c "from apex import amp"
-
-RUN \
-    # install FairScale
-    pip install fairscale==0.4.0 && \
-    python -c "import fairscale; print(fairscale.__version__)"
-
-RUN \
-    # install DeepSpeed
-    pip install deepspeed==0.5.7 && \
-    python -c "import deepspeed; print(deepspeed.__version__)"
-
 RUN \
     # Show what we have
     pip --version && \
     pip list && \
     python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
-    python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
-    python -c "import horovod.torch"
+    python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__"