From 21783b56d7787fb8e8533a78fc31b45a46e40c0b Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 14 Mar 2022 10:48:22 +0100 Subject: [PATCH 1/6] Horovod w. MPI --- dockers/base-conda/Dockerfile | 7 ++++--- dockers/base-cuda/Dockerfile | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 85d5ed345af10..733b631d5fac1 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -122,8 +122,8 @@ ENV \ HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITHOUT_MPI=1 + HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_MPI=1 RUN \ HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ @@ -154,4 +154,5 @@ RUN \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python -c "import horovod.torch" + python -c "import horovod.torch" && \ + python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 739ff591eb062..5502cdf381459 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -90,14 +90,14 @@ RUN \ rm assistant.py RUN \ - apt-get purge -y cmake && \ - wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ - tar -zxvf cmake-3.20.2.tar.gz && \ - cd cmake-3.20.2 && \ - ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ - make && \ - make install && \ - cmake --version + apt-get purge -y cmake && \ + wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ + tar -zxvf cmake-3.20.2.tar.gz && \ + cd cmake-3.20.2 && \ + ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ + make && \ + make install && \ + cmake --version ENV \ HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ @@ -105,16 +105,16 @@ ENV \ HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITHOUT_MPI=1 + HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_MPI=1 RUN \ - HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ - export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ - cat ./requirements/horovod.txt && \ - cmake --version && \ - pip install --no-cache-dir -r ./requirements/horovod.txt && \ - rm -rf requirements/ + HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ + export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ + cat ./requirements/horovod.txt && \ + cmake --version && \ + pip install --no-cache-dir -r ./requirements/horovod.txt && \ + rm -rf requirements/ RUN \ CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \ @@ -147,4 +147,5 @@ RUN \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python -c "import horovod.torch" + python -c "import horovod.torch" && \ + python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" From 011bf4cbd96d586b186eb4c89165542f1b7f963b Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 14 Mar 2022 10:53:08 +0100 Subject: [PATCH 2/6] GLOO --- dockers/base-conda/Dockerfile | 3 ++- dockers/base-cuda/Dockerfile | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 733b631d5fac1..e3e57ffc32f22 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -122,7 +122,7 @@ ENV \ HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_GLOO=1 \ HOROVOD_WITH_MPI=1 RUN \ @@ -155,4 +155,5 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ + python -c "from horovod.torch.mpi_ops import gloo_enabled, gloo_built" && \ python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 5502cdf381459..bdd4072c66883 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -105,7 +105,7 @@ ENV \ HOROVOD_WITH_PYTORCH=1 \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITHOUT_GLOO=1 \ + HOROVOD_WITH_GLOO=1 \ HOROVOD_WITH_MPI=1 RUN \ @@ -148,4 +148,5 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ + python -c "from horovod.torch.mpi_ops import gloo_enabled, gloo_built" && \ python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" From 6b0f4ed21bbecc25c48156f605b7585a97f580cf Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 14 Mar 2022 13:54:47 +0100 Subject: [PATCH 3/6] mpi_ops --- dockers/base-conda/Dockerfile | 3 +-- dockers/base-cuda/Dockerfile | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index e3e57ffc32f22..1f68a875d5836 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -155,5 +155,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ - python -c "from horovod.torch.mpi_ops import gloo_enabled, gloo_built" && \ - python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" + python -c "from horovod.torch.mpi_ops import *" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index bdd4072c66883..fe5b57bd070bd 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -148,5 +148,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ - python -c "from horovod.torch.mpi_ops import gloo_enabled, gloo_built" && \ - python -c "from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built" + python -c "from horovod.torch.mpi_ops import *" From 93d7a82f0c32f5b58754622a5b75dce7bbc18ce9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 17 Mar 2022 19:20:40 +0100 Subject: [PATCH 4/6] nccl_built --- dockers/base-cuda/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index fe5b57bd070bd..352fc4fdf31da 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -106,14 +106,14 @@ ENV \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITH_MPI=1 + HOROVOD_WITHout_MPI=1 RUN \ HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ cat ./requirements/horovod.txt && \ cmake --version && \ - pip install --no-cache-dir -r ./requirements/horovod.txt && \ + pip install --no-cache-dir -v -r ./requirements/horovod.txt && \ rm -rf requirements/ RUN \ @@ -148,4 +148,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ - python -c "from horovod.torch.mpi_ops import *" + python -c "from horovod.torch import nccl_built; nccl_built()" From 1c7a1ba21a4ca803abcdd7a1cd3bedaca81169ea Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 18 Mar 2022 08:51:30 +0100 Subject: [PATCH 5/6] fix --- dockers/base-conda/Dockerfile | 2 +- dockers/base-cuda/Dockerfile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 1f68a875d5836..280ab0687ffed 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -155,4 +155,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python -c "import horovod.torch" && \ - python -c "from horovod.torch.mpi_ops import *" + python -c "from horovod.torch import nccl_built; nccl_built()" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 352fc4fdf31da..0f202b7c27225 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -83,7 +83,7 @@ RUN \ python ./requirements/adjust-versions.py requirements/extra.txt ${PYTORCH_VERSION} && \ python ./requirements/adjust-versions.py requirements/examples.txt ${PYTORCH_VERSION} && \ python -c "print(' '.join([ln for ln in open('requirements/extra.txt').readlines() if 'horovod' in ln]))" > ./requirements/horovod.txt && \ - python assistant.py requirements_prune_pkgs requirements/examples.txt "horovod" && \ + python assistant.py requirements_prune_pkgs requirements/extra.txt "horovod" && \ # Install all requirements \ pip install -r requirements/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ rm -rf requirements.* && \ @@ -106,7 +106,7 @@ ENV \ HOROVOD_WITHOUT_TENSORFLOW=1 \ HOROVOD_WITHOUT_MXNET=1 \ HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITHout_MPI=1 + HOROVOD_WITH_MPI=1 RUN \ HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ From c1099aad875e33564c11db764a0751eade07200a Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 18 Mar 2022 08:56:48 +0100 Subject: [PATCH 6/6] -v --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 0f202b7c27225..bf57fb91025fc 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -113,7 +113,7 @@ RUN \ export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ cat ./requirements/horovod.txt && \ cmake --version && \ - pip install --no-cache-dir -v -r ./requirements/horovod.txt && \ + pip install --no-cache-dir -r ./requirements/horovod.txt && \ rm -rf requirements/ RUN \