From cab183d6ba6d3ca062a4a13993fe9139e7d8c238 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 23 Apr 2024 22:19:25 +0200 Subject: [PATCH 1/9] DON'T MERGE {2023.06}[foss/2023b] wget v1.21.4 --- .../pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml new file mode 100644 index 0000000000..2ad61e06cf --- /dev/null +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml @@ -0,0 +1,2 @@ +easyconfigs: + - wget-1.21.4-GCCcore-13.2.0.eb From 363890c0eadc47d824f5ceda1f241058db043aa5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sat, 27 Apr 2024 12:59:43 +0200 Subject: [PATCH 2/9] add py-cpuinfo instead of wget --- .../2023.06/eessi-2023.06-eb-4.9.1-2023b.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml index 2ad61e06cf..e617f8b49c 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml @@ -1,2 +1,7 @@ easyconfigs: - - wget-1.21.4-GCCcore-13.2.0.eb + # wget got ingested already + # - wget-1.21.4-GCCcore-13.2.0.eb + - py-cpuinfo-9.0.0-GCCcore-13.2.0.eb: + options: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20125 + from-commit: 6515b44cd84a20fe7876cb4bdaf3c0080e688566 From b87cf836356eba0cf001ab76d1d7396cf285153d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sat, 27 Apr 2024 15:28:29 +0200 Subject: [PATCH 3/9] only use install MODE for nvidia when building --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 
12c849205d..060dfd8233 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -223,7 +223,7 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -BUILD_STEP_ARGS+=("--nvidia" "all") +BUILD_STEP_ARGS+=("--nvidia" "install") if [[ ! -z ${SHARED_FS_PATH} ]]; then BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") fi From 08a6f0c08076c486c1f0821e8b42fdd16cdf5fd9 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 28 Apr 2024 09:19:31 +0200 Subject: [PATCH 4/9] work around failing sanity check while installing CUDA under host_injections --- .../nvidia/install_cuda_host_injections.sh | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index a9310d817a..0ad4c9eb0c 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -186,6 +186,24 @@ else fatal_error "${error}" fi + # need to temporarily overwrite arch-specific SitePackage.lua or installation + # might fail in sanity check + mkdir -p ${cuda_install_parent}/.lmod + if [ -f ${cuda_install_parent}/.lmod/SitePackage.lua ]; then + mv ${cuda_install_parent}/.lmod/SitePackage.lua bkup-xyz-SitePackage.lua + fi + cat << EOF > ${cuda_install_parent}/.lmod/SitePackage.lua +require("strict") +local hook = require("Hook") +local open = io.open + +function arch_specific_load_hook(t) + LmodMessage("Ignoring ${EESSI_SOFTWARE_PATH}/.lmod/SitePackage.lua to allow for installing CUDA/12.1.1 under host_injections") +end + +hook.register("load", arch_specific_load_hook) +EOF + # We need the --rebuild option, as the CUDA module may or may not be on the # `MODULEPATH` yet. 
Even if it is, we still want to redo this installation # since it will provide the symlinked targets for the parts of the CUDA @@ -199,6 +217,12 @@ else # shellcheck disable=SC2086 # Intended splitting of extra_args eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" ret=$? + + # restore original arch-specific SitePackage.lua if any was present + if [ -f bkup-xyz-SitePackage.lua ]; then + mv bkup-xyz-SitePackage.lua ${cuda_install_parent}/.lmod/SitePackage.lua + fi + if [ $ret -ne 0 ]; then eb_last_log=$(unset EB_VERBOSE; eb --last-log) cp -a ${eb_last_log} . From a97f89cd3ad744bf0c1e0e5a453d2b80f9076bb0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 28 Apr 2024 09:20:55 +0200 Subject: [PATCH 5/9] revert mode back to 'all' --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 060dfd8233..12c849205d 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -223,7 +223,7 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -BUILD_STEP_ARGS+=("--nvidia" "install") +BUILD_STEP_ARGS+=("--nvidia" "all") if [[ ! 
-z ${SHARED_FS_PATH} ]]; then BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") fi From 8b23e286aaa3912224b2556210ca9ac93cab65a0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 28 Apr 2024 09:36:01 +0200 Subject: [PATCH 6/9] fix easystack syntax err --- .../pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml index e617f8b49c..333331e51c 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml @@ -2,6 +2,6 @@ easyconfigs: # wget got ingested already # - wget-1.21.4-GCCcore-13.2.0.eb - py-cpuinfo-9.0.0-GCCcore-13.2.0.eb: - options: - # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20125 - from-commit: 6515b44cd84a20fe7876cb4bdaf3c0080e688566 + options: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20125 + from-commit: 6515b44cd84a20fe7876cb4bdaf3c0080e688566 From 718ce0bac652321174d60f03236346498d26a2c9 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 28 Apr 2024 09:46:50 +0200 Subject: [PATCH 7/9] remove temporary SitePackage.lua --- scripts/gpu_support/nvidia/install_cuda_host_injections.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index 0ad4c9eb0c..ed3132e2eb 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -218,6 +218,9 @@ EOF eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" ret=$? 
+ # remove temporary SitePackage.lua + rm ${cuda_install_parent}/.lmod/SitePackage.lua + # restore original arch-specific SitePackage.lua if any was present if [ -f bkup-xyz-SitePackage.lua ]; then mv bkup-xyz-SitePackage.lua ${cuda_install_parent}/.lmod/SitePackage.lua From b58d9ecd3d9f280521ceb96fe41969ed2ceebd44 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 29 Apr 2024 11:33:58 +0200 Subject: [PATCH 8/9] ignore cuda Lmod hook if module outside known EESSI locations --- create_lmodsitepackage.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 9a4a232863..8364d32c5b 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -19,6 +19,34 @@ return content end +local function from_eessi_prefix(t) + -- eessi_prefix is the prefix with official EESSI modules + -- e.g. /cvmfs/software.eessi.io/versions/2023.06 + local eessi_prefix = os.getenv("EESSI_PREFIX") + + -- If EESSI_PREFIX wasn't defined, we cannot check if this module was from the EESSI environment + -- In that case, we assume it isn't, otherwise EESSI_PREFIX would (probably) have been set + if eessi_prefix == nil then + return false + else + -- NOTE: exact paths for site and user extensions aren't final, so may need to be updated later. + -- See https://github.com/EESSI/software-layer/pull/371 + + -- eessi_prefix_host_injections is the prefix with site-extensions (i.e. additional modules) + -- to the official EESSI modules, e.g. /cvmfs/software.eessi.io/host_injections/2023.06 + local eessi_prefix_host_injections = string.gsub(eessi_prefix, 'versions', 'host_injections') + + -- eessi_prefix_user_home is the prefix with user-extensions (i.e. additional modules) + -- to the official EESSI modules, e.g. 
$HOME/eessi/versions/2023.06 + local eessi_prefix_user_home = string.gsub(eessi_prefix, os.getenv("EESSI_CVMFS_REPO"), pathJoin(os.getenv("HOME"), "eessi")) + + -- Check if the full modulepath starts with the eessi_prefix_* + return string.find(t.fn, "^" .. eessi_prefix) ~= nil or + string.find(t.fn, "^" .. eessi_prefix_host_injections) ~= nil or + string.find(t.fn, "^" .. eessi_prefix_user_home) ~= nil + end +end + local function eessi_cuda_enabled_load_hook(t) local frameStk = require("FrameStk"):singleton() local mt = frameStk:mt() @@ -107,7 +135,11 @@ -- Combine both functions into a single one, as we can only register one function as load hook in lmod -- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed function eessi_load_hook(t) - eessi_cuda_enabled_load_hook(t) + -- Only apply CUDA hooks if the loaded module is in the EESSI prefix + -- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack + if from_eessi_prefix(t) then + eessi_cuda_enabled_load_hook(t) + end eessi_openmpi_load_hook(t) end From 680da9e61b41b86958e259db64921ecc9ad7f9de Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 29 Apr 2024 11:35:29 +0200 Subject: [PATCH 9/9] remove workaround for failing CUDA install --- .../nvidia/install_cuda_host_injections.sh | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index ed3132e2eb..a9310d817a 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -186,24 +186,6 @@ else fatal_error "${error}" fi - # need to temporarily overwrite arch-specific SitePackage.lua or installation - # might fail in sanity check - mkdir -p ${cuda_install_parent}/.lmod - if [ -f ${cuda_install_parent}/.lmod/SitePackage.lua ]; then - mv 
${cuda_install_parent}/.lmod/SitePackage.lua bkup-xyz-SitePackage.lua - fi - cat << EOF > ${cuda_install_parent}/.lmod/SitePackage.lua -require("strict") -local hook = require("Hook") -local open = io.open - -function arch_specific_load_hook(t) - LmodMessage("Ignoring ${EESSI_SOFTWARE_PATH}/.lmod/SitePackage.lua to allow for installing CUDA/12.1.1 under host_injections") -end - -hook.register("load", arch_specific_load_hook) -EOF - # We need the --rebuild option, as the CUDA module may or may not be on the # `MODULEPATH` yet. Even if it is, we still want to redo this installation # since it will provide the symlinked targets for the parts of the CUDA @@ -217,15 +199,6 @@ EOF # shellcheck disable=SC2086 # Intended splitting of extra_args eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" ret=$? - - # remove temporary SitePackage.lua - rm ${cuda_install_parent}/.lmod/SitePackage.lua - - # restore original arch-specific SitePackage.lua if any was present - if [ -f bkup-xyz-SitePackage.lua ]; then - mv bkup-xyz-SitePackage.lua ${cuda_install_parent}/.lmod/SitePackage.lua - fi - if [ $ret -ne 0 ]; then eb_last_log=$(unset EB_VERBOSE; eb --last-log) cp -a ${eb_last_log} .