From effe403ac1bfbdc78a4aa14577585ae869ed10e2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 17 Mar 2022 14:04:56 +0900 Subject: [PATCH 01/22] horovodrun --check-build --- dockers/base-cuda/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 739ff591eb062..ccd183aca724a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -114,6 +114,7 @@ RUN \ cat ./requirements/horovod.txt && \ cmake --version && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ + horovodrun --check-build && \ rm -rf requirements/ RUN \ From 9fa4e215647d3f97ee2bc406574acc35e566f7a7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 17 Mar 2022 14:05:18 +0900 Subject: [PATCH 02/22] rm concurrency limit --- .github/workflows/ci_dockers.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 0a32aede3489c..1bc2bc3bfb916 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran - ".github/workflows/events-nightly.yml" - "setup.py" -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} +# FIXME +# concurrency: +# group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} +# cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} jobs: build-PL: From d321e1f7cd02f3bb75cf6d1f86ca63d685420cb1 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 17 Mar 2022 14:07:37 +0900 Subject: [PATCH 03/22] Reinstall horovod --- dockers/base-cuda/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ccd183aca724a..58d6a186566c7 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -115,6 +115,9 @@ RUN \ cmake --version && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ horovodrun --check-build && \ + pip uninstall -y horovod && \ + pip install --no-cache-dir -r ./requirements/horovod.txt && \ + horovodrun --check-build && \ rm -rf requirements/ RUN \ From c979144f67bf32679b92f716e8618110c81cb50b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 17 Mar 2022 14:52:57 +0900 Subject: [PATCH 04/22] Extend timeout --- .github/workflows/ci_dockers.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 1bc2bc3bfb916..f6a274510437b 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -44,7 +44,7 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} file: dockers/release/Dockerfile push: false - timeout-minutes: 50 + timeout-minutes: 70 # FIXME build-XLA: runs-on: ubuntu-20.04 From 90b1b2162821e2b250f857aa7dc81f7595363974 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 03:19:35 +0900 Subject: [PATCH 05/22] set back concurrency --- .github/workflows/ci_dockers.yml | 7 +++---- .github/workflows/events-nightly.yml | 2 ++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index f6a274510437b..b63e71f5da0c5 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -17,10 +17,9 @@ on: # Trigger the workflow on push or pull request, but only for the master bran - ".github/workflows/events-nightly.yml" - "setup.py" -# FIXME -# concurrency: -# group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} -# cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} jobs: build-PL: diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 9704139d1da78..cc10eeb11f1c0 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -6,6 +6,8 @@ on: schedule: # At the end of every day - cron: "0 0 * * *" + # FIXME + push: {} env: PUSH_TO_HUB: true From 56eb55459fbbc3378613ee6a712974985861b7e9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 04:33:55 +0900 Subject: [PATCH 06/22] revert docker push --- .github/workflows/events-nightly.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index cc10eeb11f1c0..9704139d1da78 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -6,8 +6,6 @@ on: schedule: # At the end of every day - cron: "0 0 * * *" - # FIXME - push: {} env: PUSH_TO_HUB: true From d1b3e1592164182860b90f6cd6165e8656723156 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 06:23:33 +0900 Subject: [PATCH 07/22] Skip test --- tests/callbacks/test_pruning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 1b979d3f865fe..f6feebd919a59 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -174,6 +174,7 @@ def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructur ) +@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI") @RunIf(min_gpus=2, skip_windows=True) def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback( From 68a13c66383ccc13e98591e8bf90ccc0afa54cb3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 06:35:44 +0900 Subject: [PATCH 08/22] Skip test --- tests/callbacks/test_quantization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index 2f146d9a1dd19..7f3da9eca2249 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -35,6 +35,7 @@ from torch.quantization import FakeQuantize as FakeQuantizeBase +@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI") @pytest.mark.parametrize("observe", ["average", "histogram"]) @pytest.mark.parametrize("fuse", [True, False]) @pytest.mark.parametrize("convert", [True, False]) From a879a504c37a35bbe939e5532c23370fc82dabe6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 07:32:45 +0900 Subject: [PATCH 09/22] Pin docker image sha --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 0726c6cf396cf..07ccbcf4fe5df 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -29,7 +29,7 @@ jobs: container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 # run on torch 1.8 as it's the LTS version - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" From d2351ea6de3a441ca4fd119afa26e64227b7c77d Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 07:43:06 +0900 Subject: [PATCH 10/22] Fix syntax --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 07ccbcf4fe5df..4ba7efad4bb32 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -29,7 +29,7 @@ jobs: container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 # run on torch 1.8 as it's the LTS version - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" From 7251f799597b00e47441152e2ded3f72a3b9e86b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 09:30:14 +0900 Subject: [PATCH 11/22] Add comment and update sha256 in gpu-benchmark.yml --- .azure-pipelines/gpu-benchmark.yml | 4 ++-- .azure-pipelines/gpu-tests.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index 6d45cc2f4566a..2cbc32c513859 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -28,8 +28,8 @@ jobs: cancelTimeoutInMinutes: "2" pool: gridai-spot-pool container: - # should match the one in '.azure-pipelines/gpu-benchmark.yml' - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + # TODO: Unpin sha256 + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 4ba7efad4bb32..7c19a0ba8c7ee 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -29,6 +29,7 @@ jobs: container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 # run on torch 1.8 as it's the LTS version + # TODO: Unpin sha256 image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' From 947fd383ad08781edf0a9705701ceb0637d11596 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 09:50:38 +0900 Subject: [PATCH 12/22] Revert "Skip test" This reverts commit 68a13c66383ccc13e98591e8bf90ccc0afa54cb3. --- tests/callbacks/test_quantization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index 7f3da9eca2249..2f146d9a1dd19 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -35,7 +35,6 @@ from torch.quantization import FakeQuantize as FakeQuantizeBase -@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI") @pytest.mark.parametrize("observe", ["average", "histogram"]) @pytest.mark.parametrize("fuse", [True, False]) @pytest.mark.parametrize("convert", [True, False]) From bd9d4550ebb3edbf72dd113b91affa69ac927ef9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 09:50:44 +0900 Subject: [PATCH 13/22] Revert "Skip test" This reverts commit d1b3e1592164182860b90f6cd6165e8656723156. --- tests/callbacks/test_pruning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index f6feebd919a59..1b979d3f865fe 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -174,7 +174,6 @@ def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructur ) -@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI") @RunIf(min_gpus=2, skip_windows=True) def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback( From c6f65ddadbe3b6958e83d4d83b8cf8a678b51eb8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 11:05:07 +0900 Subject: [PATCH 14/22] Fail fast on horovod installation --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 58d6a186566c7..eaab3826f6d67 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -118,6 +118,7 @@ RUN \ pip uninstall -y horovod && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ horovodrun --check-build && \ + python -c "from horovod.torch import nccl_built; nccl_built()" rm -rf requirements/ RUN \ @@ -151,4 +152,3 @@ RUN \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python -c "import horovod.torch" From 2ffbc0392825f7849392d23874432930efcb2510 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 11:19:58 +0900 Subject: [PATCH 15/22] Fix syntax --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index eaab3826f6d67..15603c2301f40 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -151,4 +151,4 @@ RUN \ pip --version && \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ - python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ + python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" From 9541715791b432f5f9db933653bca2296ffa856f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 11:28:17 +0900 Subject: [PATCH 16/22] No more syntax error please --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 15603c2301f40..30cefdeb84e4a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -118,7 +118,7 @@ RUN \ pip uninstall -y horovod && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ horovodrun --check-build && \ - python -c "from horovod.torch import nccl_built; nccl_built()" + python -c "from horovod.torch import nccl_built; nccl_built()" && \ rm -rf requirements/ RUN \ From 291c09c1cb0b2269d561417778ea26bf0b04d819 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:00:43 +0900 Subject: [PATCH 17/22] Simplify Dockerfile for debugging --- dockers/base-cuda/Dockerfile | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 30cefdeb84e4a..2636ece070c3a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -121,31 +121,6 @@ RUN \ python -c "from horovod.torch import nccl_built; nccl_built()" && \ rm -rf requirements/ -RUN \ - CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \ - py_ver=$(python -c "print(int('$PYTHON_VERSION'.split('.') >= '3.9'.split('.')))") && \ - # install DALI, needed for examples - # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691 - if [ $py_ver -eq "0" ]; then \ - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \ - python -c 'from nvidia.dali.pipeline import Pipeline' ; \ - fi - -RUN \ - # install NVIDIA apex - pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \ - python -c "from apex import amp" - -RUN \ - # install FairScale - pip install fairscale==0.4.0 && \ - python -c "import fairscale; print(fairscale.__version__)" - -RUN \ - # install DeepSpeed - pip install deepspeed==0.5.7 && \ - python -c "import deepspeed; print(deepspeed.__version__)" - RUN \ # Show what we have pip --version && \ From de2a7f109b54001b87e30d5d8883024b06bd57d4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:01:39 +0900 Subject: [PATCH 18/22] Dump check-build to log output --- dockers/base-cuda/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 2636ece070c3a..3fae363708fe8 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -114,10 +114,10 @@ RUN \ cat ./requirements/horovod.txt && \ cmake --version && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ - horovodrun --check-build && \ + PYTHONUNBUFFERED=1 horovodrun --check-build && \ pip uninstall -y horovod && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ - horovodrun --check-build && \ + PYTHONUNBUFFERED=1 horovodrun --check-build && \ python -c "from horovod.torch import nccl_built; nccl_built()" && \ rm -rf requirements/ From c36276c2e525ff23eaac07fa211cc00acb80c877 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:02:29 +0900 Subject: [PATCH 19/22] rm concurrency limit --- .github/workflows/ci_dockers.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index b63e71f5da0c5..f6a274510437b 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran - ".github/workflows/events-nightly.yml" - "setup.py" -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} +# FIXME +# concurrency: +# group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} +# cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} jobs: build-PL: From cf11dca2bfa1d09aa9298557f3cf50545347de00 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:04:58 +0900 Subject: [PATCH 20/22] Comment reinstallation --- dockers/base-cuda/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 3fae363708fe8..5f53117e5791f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -115,9 +115,9 @@ RUN \ cmake --version && \ pip install --no-cache-dir -r ./requirements/horovod.txt && \ PYTHONUNBUFFERED=1 horovodrun --check-build && \ - pip uninstall -y horovod && \ - pip install --no-cache-dir -r ./requirements/horovod.txt && \ - PYTHONUNBUFFERED=1 horovodrun --check-build && \ + # pip uninstall -y horovod && \ + # pip install --no-cache-dir -r ./requirements/horovod.txt && \ + # PYTHONUNBUFFERED=1 horovodrun --check-build && \ python -c "from horovod.torch import nccl_built; nccl_built()" && \ rm -rf requirements/ From 707b7cf2570e5a0dac85aedbe1a7e2fbee3b8ff5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:05:38 +0900 Subject: [PATCH 21/22] Pin horovod==0.24.2 --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 5f53117e5791f..07529a0f63f1e 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -113,7 +113,7 @@ RUN \ export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ cat ./requirements/horovod.txt && \ cmake --version && \ - pip install --no-cache-dir -r ./requirements/horovod.txt && \ + pip install --no-cache-dir "horovod==0.24.2" && \ PYTHONUNBUFFERED=1 horovodrun --check-build && \ # pip uninstall -y horovod && \ # pip install --no-cache-dir -r ./requirements/horovod.txt && \ From d57d637c741cebfcb313a6e80281ca921ce8ddd9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 18 Mar 2022 17:09:55 +0900 Subject: [PATCH 22/22] Pin horovod==0.24.1 --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 07529a0f63f1e..b820c0383b468 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -113,7 +113,7 @@ RUN \ export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ cat ./requirements/horovod.txt && \ cmake --version && \ - pip install --no-cache-dir "horovod==0.24.2" && \ + pip install --no-cache-dir "horovod==0.24.1" && \ PYTHONUNBUFFERED=1 horovodrun --check-build && \ # pip uninstall -y horovod && \ # pip install --no-cache-dir -r ./requirements/horovod.txt && \