From effe403ac1bfbdc78a4aa14577585ae869ed10e2 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Thu, 17 Mar 2022 14:04:56 +0900
Subject: [PATCH 01/22] horovodrun --check-build

---
 dockers/base-cuda/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 739ff591eb062..ccd183aca724a 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -114,6 +114,7 @@ RUN \
    cat ./requirements/horovod.txt && \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
+   horovodrun --check-build && \
    rm -rf requirements/
 
 RUN \

From 9fa4e215647d3f97ee2bc406574acc35e566f7a7 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Thu, 17 Mar 2022 14:05:18 +0900
Subject: [PATCH 02/22] Temporarily disable workflow concurrency limit

---
 .github/workflows/ci_dockers.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 0a32aede3489c..1bc2bc3bfb916 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
       - ".github/workflows/events-nightly.yml"
       - "setup.py"
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
-  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
+# FIXME
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
+#   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 jobs:
   build-PL:

From d321e1f7cd02f3bb75cf6d1f86ca63d685420cb1 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Thu, 17 Mar 2022 14:07:37 +0900
Subject: [PATCH 03/22] Reinstall horovod

---
 dockers/base-cuda/Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index ccd183aca724a..58d6a186566c7 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -115,6 +115,9 @@ RUN \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
    horovodrun --check-build && \
+   pip uninstall -y horovod && \
+   pip install --no-cache-dir -r ./requirements/horovod.txt && \
+   horovodrun --check-build && \
    rm -rf requirements/
 
 RUN \

From c979144f67bf32679b92f716e8618110c81cb50b Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Thu, 17 Mar 2022 14:52:57 +0900
Subject: [PATCH 04/22] Extend timeout

---
 .github/workflows/ci_dockers.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 1bc2bc3bfb916..f6a274510437b 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -44,7 +44,7 @@ jobs:
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
           file: dockers/release/Dockerfile
           push: false
-        timeout-minutes: 50
+        timeout-minutes: 70  # FIXME
 
   build-XLA:
     runs-on: ubuntu-20.04

From 90b1b2162821e2b250f857aa7dc81f7595363974 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 03:19:35 +0900
Subject: [PATCH 05/22] Restore concurrency limit and trigger nightly workflow on push

---
 .github/workflows/ci_dockers.yml     | 7 +++----
 .github/workflows/events-nightly.yml | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index f6a274510437b..b63e71f5da0c5 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -17,10 +17,9 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
       - ".github/workflows/events-nightly.yml"
       - "setup.py"
 
-# FIXME
-# concurrency:
-#   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
-#   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
+  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 jobs:
   build-PL:
diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 9704139d1da78..cc10eeb11f1c0 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -6,6 +6,8 @@ on:
   schedule:
     # At the end of every day
     - cron: "0 0 * * *"
+  # FIXME
+  push: {}
 
 env:
   PUSH_TO_HUB: true

From 56eb55459fbbc3378613ee6a712974985861b7e9 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 04:33:55 +0900
Subject: [PATCH 06/22] Revert push trigger on nightly workflow

---
 .github/workflows/events-nightly.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index cc10eeb11f1c0..9704139d1da78 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -6,8 +6,6 @@ on:
   schedule:
     # At the end of every day
     - cron: "0 0 * * *"
-  # FIXME
-  push: {}
 
 env:
   PUSH_TO_HUB: true

From d1b3e1592164182860b90f6cd6165e8656723156 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 06:23:33 +0900
Subject: [PATCH 07/22] Skip test

---
 tests/callbacks/test_pruning.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 1b979d3f865fe..f6feebd919a59 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -174,6 +174,7 @@ def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructur
     )
 
 
+@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI")
 @RunIf(min_gpus=2, skip_windows=True)
 def test_pruning_callback_ddp_spawn(tmpdir):
     train_with_pruning_callback(

From 68a13c66383ccc13e98591e8bf90ccc0afa54cb3 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 06:35:44 +0900
Subject: [PATCH 08/22] Skip test

---
 tests/callbacks/test_quantization.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py
index 2f146d9a1dd19..7f3da9eca2249 100644
--- a/tests/callbacks/test_quantization.py
+++ b/tests/callbacks/test_quantization.py
@@ -35,6 +35,7 @@
     from torch.quantization import FakeQuantize as FakeQuantizeBase
 
 
+@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI")
 @pytest.mark.parametrize("observe", ["average", "histogram"])
 @pytest.mark.parametrize("fuse", [True, False])
 @pytest.mark.parametrize("convert", [True, False])

From a879a504c37a35bbe939e5532c23370fc82dabe6 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 07:32:45 +0900
Subject: [PATCH 09/22] Pin docker image sha

---
 .azure-pipelines/gpu-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 0726c6cf396cf..07ccbcf4fe5df 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       # run on torch 1.8 as it's the LTS version
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"

From d2351ea6de3a441ca4fd119afa26e64227b7c77d Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 07:43:06 +0900
Subject: [PATCH 10/22] Add missing sha256: prefix to pinned image digest

---
 .azure-pipelines/gpu-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 07ccbcf4fe5df..4ba7efad4bb32 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       # run on torch 1.8 as it's the LTS version
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"

From 7251f799597b00e47441152e2ded3f72a3b9e86b Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 09:30:14 +0900
Subject: [PATCH 11/22] Add comment and update sha256 in gpu-benchmark.yml

---
 .azure-pipelines/gpu-benchmark.yml | 4 ++--
 .azure-pipelines/gpu-tests.yml     | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml
index 6d45cc2f4566a..2cbc32c513859 100644
--- a/.azure-pipelines/gpu-benchmark.yml
+++ b/.azure-pipelines/gpu-benchmark.yml
@@ -28,8 +28,8 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: gridai-spot-pool
     container:
-      # should match the one in '.azure-pipelines/gpu-benchmark.yml'
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      # TODO: Unpin sha256
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 4ba7efad4bb32..7c19a0ba8c7ee 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -29,6 +29,7 @@ jobs:
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       # run on torch 1.8 as it's the LTS version
+      # TODO: Unpin sha256
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'

From 947fd383ad08781edf0a9705701ceb0637d11596 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 09:50:38 +0900
Subject: [PATCH 12/22] Revert "Skip test"

This reverts commit 68a13c66383ccc13e98591e8bf90ccc0afa54cb3.
---
 tests/callbacks/test_quantization.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py
index 7f3da9eca2249..2f146d9a1dd19 100644
--- a/tests/callbacks/test_quantization.py
+++ b/tests/callbacks/test_quantization.py
@@ -35,7 +35,6 @@
     from torch.quantization import FakeQuantize as FakeQuantizeBase
 
 
-@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI")
 @pytest.mark.parametrize("observe", ["average", "histogram"])
 @pytest.mark.parametrize("fuse", [True, False])
 @pytest.mark.parametrize("convert", [True, False])

From bd9d4550ebb3edbf72dd113b91affa69ac927ef9 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 09:50:44 +0900
Subject: [PATCH 13/22] Revert "Skip test"

This reverts commit d1b3e1592164182860b90f6cd6165e8656723156.
---
 tests/callbacks/test_pruning.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index f6feebd919a59..1b979d3f865fe 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -174,7 +174,6 @@ def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructur
     )
 
 
-@pytest.mark.skip(reason="TODO: Possible cause of segfaults in CI")
 @RunIf(min_gpus=2, skip_windows=True)
 def test_pruning_callback_ddp_spawn(tmpdir):
     train_with_pruning_callback(

From c6f65ddadbe3b6958e83d4d83b8cf8a678b51eb8 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 11:05:07 +0900
Subject: [PATCH 14/22] Fail fast on horovod installation

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 58d6a186566c7..eaab3826f6d67 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -118,6 +118,7 @@ RUN \
    pip uninstall -y horovod && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
    horovodrun --check-build && \
+   python -c "from horovod.torch import nccl_built; nccl_built()"
    rm -rf requirements/
 
 RUN \
@@ -151,4 +152,3 @@ RUN \
     pip list && \
     python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
     python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
-    python -c "import horovod.torch"

From 2ffbc0392825f7849392d23874432930efcb2510 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 11:19:58 +0900
Subject: [PATCH 15/22] Remove dangling line continuation at end of final RUN step

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index eaab3826f6d67..15603c2301f40 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -151,4 +151,4 @@ RUN \
     pip --version && \
     pip list && \
     python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
-    python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
+    python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__"

From 9541715791b432f5f9db933653bca2296ffa856f Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 11:28:17 +0900
Subject: [PATCH 16/22] Add missing line continuation after nccl_built check

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 15603c2301f40..30cefdeb84e4a 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -118,7 +118,7 @@ RUN \
    pip uninstall -y horovod && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
    horovodrun --check-build && \
-   python -c "from horovod.torch import nccl_built; nccl_built()"
+   python -c "from horovod.torch import nccl_built; nccl_built()" && \
    rm -rf requirements/
 
 RUN \

From 291c09c1cb0b2269d561417778ea26bf0b04d819 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:00:43 +0900
Subject: [PATCH 17/22] Simplify Dockerfile for debugging

---
 dockers/base-cuda/Dockerfile | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 30cefdeb84e4a..2636ece070c3a 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -121,31 +121,6 @@ RUN \
    python -c "from horovod.torch import nccl_built; nccl_built()" && \
    rm -rf requirements/
 
-RUN \
-    CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \
-    py_ver=$(python -c "print(int('$PYTHON_VERSION'.split('.') >= '3.9'.split('.')))") && \
-    # install DALI, needed for examples
-    # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691
-    if [ $py_ver -eq "0" ]; then \
-        pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \
-        python -c 'from nvidia.dali.pipeline import Pipeline' ; \
-    fi
-
-RUN \
-    # install NVIDIA apex
-    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
-    python -c "from apex import amp"
-
-RUN \
-    # install FairScale
-    pip install fairscale==0.4.0 && \
-    python -c "import fairscale; print(fairscale.__version__)"
-
-RUN \
-    # install DeepSpeed
-    pip install deepspeed==0.5.7 && \
-    python -c "import deepspeed; print(deepspeed.__version__)"
-
 RUN \
     # Show what we have
     pip --version && \

From de2a7f109b54001b87e30d5d8883024b06bd57d4 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:01:39 +0900
Subject: [PATCH 18/22] Dump check-build to log output

---
 dockers/base-cuda/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 2636ece070c3a..3fae363708fe8 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -114,10 +114,10 @@ RUN \
    cat ./requirements/horovod.txt && \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
-   horovodrun --check-build && \
+   PYTHONUNBUFFERED=1 horovodrun --check-build && \
    pip uninstall -y horovod && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
-   horovodrun --check-build && \
+   PYTHONUNBUFFERED=1 horovodrun --check-build && \
    python -c "from horovod.torch import nccl_built; nccl_built()" && \
    rm -rf requirements/
 

From c36276c2e525ff23eaac07fa211cc00acb80c877 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:02:29 +0900
Subject: [PATCH 19/22] Temporarily disable workflow concurrency limit again

---
 .github/workflows/ci_dockers.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index b63e71f5da0c5..f6a274510437b 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -17,9 +17,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
       - ".github/workflows/events-nightly.yml"
       - "setup.py"
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
-  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
+# FIXME
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
+#   cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
 
 jobs:
   build-PL:

From cf11dca2bfa1d09aa9298557f3cf50545347de00 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:04:58 +0900
Subject: [PATCH 20/22] Comment out horovod reinstallation

---
 dockers/base-cuda/Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 3fae363708fe8..5f53117e5791f 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -115,9 +115,9 @@ RUN \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
    PYTHONUNBUFFERED=1 horovodrun --check-build && \
-   pip uninstall -y horovod && \
-   pip install --no-cache-dir -r ./requirements/horovod.txt && \
-   PYTHONUNBUFFERED=1 horovodrun --check-build && \
+   # pip uninstall -y horovod && \
+   # pip install --no-cache-dir -r ./requirements/horovod.txt && \
+   # PYTHONUNBUFFERED=1 horovodrun --check-build && \
    python -c "from horovod.torch import nccl_built; nccl_built()" && \
    rm -rf requirements/
 

From 707b7cf2570e5a0dac85aedbe1a7e2fbee3b8ff5 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:05:38 +0900
Subject: [PATCH 21/22] Pin horovod==0.24.2

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 5f53117e5791f..07529a0f63f1e 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -113,7 +113,7 @@ RUN \
    export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
    cat ./requirements/horovod.txt && \
    cmake --version && \
-   pip install --no-cache-dir -r ./requirements/horovod.txt && \
+   pip install --no-cache-dir "horovod==0.24.2" && \
    PYTHONUNBUFFERED=1 horovodrun --check-build && \
    # pip uninstall -y horovod && \
    # pip install --no-cache-dir -r ./requirements/horovod.txt && \

From d57d637c741cebfcb313a6e80281ca921ce8ddd9 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Fri, 18 Mar 2022 17:09:55 +0900
Subject: [PATCH 22/22] Pin horovod==0.24.1

---
 dockers/base-cuda/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 07529a0f63f1e..b820c0383b468 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -113,7 +113,7 @@ RUN \
    export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
    cat ./requirements/horovod.txt && \
    cmake --version && \
-   pip install --no-cache-dir "horovod==0.24.2" && \
+   pip install --no-cache-dir "horovod==0.24.1" && \
    PYTHONUNBUFFERED=1 horovodrun --check-build && \
    # pip uninstall -y horovod && \
    # pip install --no-cache-dir -r ./requirements/horovod.txt && \