Skip to content

Commit 408dabf

Browse files
committed
Allow for CUDA compute capability fallbacks when initialising EESSI
1 parent 6e1dbce commit 408dabf

File tree

5 files changed

+146
-17
lines changed

5 files changed

+146
-17
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import os
2+
3+
import os
4+
import sys
5+
6+
class EnvVarError(Exception):
    """Raised when an environment variable check fails.

    The message passed in is prefixed with ``ENV VALIDATION ERROR: `` so the
    failure is easy to spot when it is printed to stderr by the script entry
    point.
    """

    def __init__(self, message):
        super().__init__(f"ENV VALIDATION ERROR: {message}")
10+
11+
def get_env_vars(var1, var2):
    """Return the values of two environment variables as a ``(val1, val2)`` tuple.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the second environment variable.

    Returns:
        A 2-tuple with the values of ``var1`` and ``var2``, in that order.

    Raises:
        EnvVarError: If either variable is unset or set to an empty string.
            ``var1`` is checked before ``var2``.
    """
    values = []
    for name in (var1, var2):
        value = os.environ.get(name)
        if value is None:
            raise EnvVarError(f"Missing environment variable: '{name}'")
        if not value:
            # An empty string is a substring and a suffix of every string, so
            # letting it through would make the contains/endswith checks pass
            # without verifying anything.
            raise EnvVarError(f"Empty environment variable: '{name}'")
        values.append(value)
    return tuple(values)
21+
22+
def check_env_equals(var1, var2):
    """Verify that two environment variables hold exactly the same value.

    Args:
        var1: Name of the first environment variable.
        var2: Name of the environment variable whose value ``var1`` must equal.

    Raises:
        EnvVarError: If the values differ, or if ``get_env_vars`` rejects
            either variable.
    """
    val1, val2 = get_env_vars(var1, var2)
    if val1 != val2:
        raise EnvVarError(f"'{var1}' must equal '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
26+
27+
def check_env_contains(var1, var2):
    """Verify that the value of ``var1`` contains the value of ``var2``.

    The comparison is a plain substring test on the two values.

    Args:
        var1: Name of the environment variable that is searched.
        var2: Name of the environment variable whose value must occur in
            the value of ``var1``.

    Raises:
        EnvVarError: If the value of ``var2`` is not a substring of the value
            of ``var1``, or if ``get_env_vars`` rejects either variable.
    """
    val1, val2 = get_env_vars(var1, var2)
    if val2 not in val1:
        raise EnvVarError(f"'{var1}' must contain '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
31+
32+
def check_env_endswith(var1, var2):
    """Verify that the value of ``var1`` ends with the value of ``var2``.

    Args:
        var1: Name of the environment variable whose value is inspected.
        var2: Name of the environment variable whose value must be the
            suffix of the value of ``var1``.

    Raises:
        EnvVarError: If the value of ``var1`` does not end with the value of
            ``var2``, or if ``get_env_vars`` rejects either variable.
    """
    val1, val2 = get_env_vars(var1, var2)
    if not val1.endswith(val2):
        raise EnvVarError(f"'{var1}' must end with '{var2}':\n{var1}='{val1}'\n{var2}='{val2}'")
36+
37+
if __name__ == "__main__":
    # Script entry point: run every consistency check in sequence. The first
    # failing check raises EnvVarError, which is reported on stderr and turned
    # into exit status 1.
    try:
        # accelerator stuff is not guaranteed to exist, so the accelerator
        # checks below only run when an override was requested
        expected_eessi_accel_arch = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE", default=None)

        # Verify the software and accelerator targets are set correctly
        if os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE", default=None):
            check_env_equals("EESSI_SOFTWARE_SUBDIR_OVERRIDE", "EESSI_SOFTWARE_SUBDIR")
        if expected_eessi_accel_arch:
            # EESSI_ACCEL_SUBDIR is what is detected by archdetect (or respects EESSI_ACCELERATOR_TARGET_OVERRIDE)
            check_env_equals("EESSI_ACCELERATOR_TARGET_OVERRIDE", "EESSI_ACCEL_SUBDIR")
            # special case is where EESSI_ACCELERATOR_TARGET_OVERRIDE may not match the final
            # accelerator architecture chosen (in CI we deliberately choose a non-existent CUDA
            # compute capability for one case).
            # The expected final target is the override with its last character replaced by "0"
            # (e.g. cc77 -> cc70). It is stored in the environment under EESSI_FINAL_CC only so
            # that the name-based check_env_equals helper can be reused for the comparison.
            os.environ["EESSI_FINAL_CC"] = expected_eessi_accel_arch[:-1] + "0"
            check_env_equals("EESSI_ACCELERATOR_TARGET", "EESSI_FINAL_CC")

        # verify the software paths that should exist
        check_env_endswith("EESSI_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")
        check_env_endswith("EESSI_SITE_SOFTWARE_PATH", "EESSI_SOFTWARE_SUBDIR")

        # verify the module paths that should exist
        check_env_contains("EESSI_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
        check_env_contains("EESSI_SITE_MODULEPATH", "EESSI_SOFTWARE_SUBDIR")
        if expected_eessi_accel_arch:
            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_SOFTWARE_SUBDIR")
            check_env_contains("EESSI_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")
            check_env_contains("EESSI_SITE_MODULEPATH_ACCEL", "EESSI_ACCELERATOR_TARGET")

        # Finally, verify that all the expected module paths are included
        check_env_contains("MODULEPATH", "EESSI_MODULEPATH")
        check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH")
        if expected_eessi_accel_arch:
            check_env_contains("MODULEPATH", "EESSI_MODULEPATH_ACCEL")
            check_env_contains("MODULEPATH", "EESSI_SITE_MODULEPATH_ACCEL")

        # We are done
        print("Environment variable check passed.")
    except EnvVarError as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)

.github/workflows/tests_eessi_module.yml

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,9 @@ jobs:
7373
- x86_64/amd/zen3
7474
- x86_64/amd/zen4
7575
EESSI_ACCELERATOR_TARGET_OVERRIDE:
76-
- accel/nvidia/cc80
76+
- accel/nvidia/cc80
77+
# This should fall back to cc70
78+
- accel/nvidia/cc77
7779
steps:
7880
- name: Check out software-layer repository
7981
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -102,21 +104,26 @@ jobs:
102104
# Turn on debug output in case we want to take a look
103105
export EESSI_DEBUG_INIT=true
104106
CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
105-
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
106107
module load EESSI/${{matrix.EESSI_VERSION}}
107-
# EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
108-
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
108+
# EESSI_DEBUG_INIT/EESSI_ARCHDETECT_OPTIONS only relevant for Lmod init
109109
unset EESSI_DEBUG_INIT
110110
# Store all relevant environment variables
111-
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
111+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | grep -v EESSI_ARCHDETECT_OPTIONS | sort > "${moduleoutfile}"
112112
module unload EESSI/${{matrix.EESSI_VERSION}}
113113
114+
# We should only have two EESSI_* variables defined (which set the overrides)
115+
if [ "$(env | grep -c '^EESSI')" -ne 2 ]; then
116+
echo "Expected 2 EESSI-related environment variables, but found a different number."
117+
env | grep '^EESSI'
118+
exit 1
119+
fi
120+
114121
# Now do the init script initialisation
115122
source ./init/bash
116123
# source script version sets environment variables to force archdetect, ignore these
117124
unset EESSI_USE_ARCHSPEC
118125
unset EESSI_USE_ARCHDETECT
119-
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
126+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH|^MODULEPATH)' | sort > "${sourceoutfile}"
120127
121128
# Now compare the two results
122129
echo ""
@@ -149,6 +156,8 @@ jobs:
149156
EESSI_ACCELERATOR_TARGET_OVERRIDE:
150157
- none
151158
- accel/nvidia/cc80
159+
# This should fall back to cc70
160+
- accel/nvidia/cc77
152161
steps:
153162
- name: Check out software-layer repository
154163
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -181,7 +190,7 @@ jobs:
181190
initial_env_file="initial_env.txt"
182191
module_cycled_file="load_unload_cycle.txt"
183192
184-
# prepare Lmod, resetting it in a roundabout given we don't want defaults set
193+
# prepare Lmod, resetting it in a roundabout way given we don't want defaults set
185194
export MODULEPATH=init/modules:.github/workflows/modules
186195
module load fake_module
187196
module purge
@@ -205,3 +214,9 @@ jobs:
205214
diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
206215
exit 1
207216
fi
217+
218+
module load EESSI/${{matrix.EESSI_VERSION}}
219+
# Make sure our CPU and GPU architectures are what we expect
220+
# (script uses EESSI_SOFTWARE_SUBDIR_OVERRIDE and EESSI_ACCELERATOR_TARGET_OVERRIDE
221+
# as the starting point for the comparison)
222+
python .github/workflows/scripts/verify_eessi_environment.py

init/bash

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,20 @@ if [ $? -eq 0 ]; then
2626
# prepend location of modules for EESSI software stack to $MODULEPATH
2727
show_msg "Prepending $EESSI_MODULEPATH to \$MODULEPATH..."
2828
module use $EESSI_MODULEPATH
29-
show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
30-
module use $EESSI_SITE_MODULEPATH
3129

3230
if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
3331
show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
3432
module use $EESSI_MODULEPATH_ACCEL
3533
fi
3634

35+
show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
36+
module use $EESSI_SITE_MODULEPATH
37+
38+
if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then
39+
show_msg "Prepending $EESSI_SITE_MODULEPATH_ACCEL to \$MODULEPATH..."
40+
module use $EESSI_SITE_MODULEPATH_ACCEL
41+
fi
42+
3743
#show_msg ""
3844
#show_msg "*** Known problems in the ${EESSI_VERSION} software stack ***"
3945
#show_msg ""

init/eessi_environment_variables

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,18 @@ if [ -d $EESSI_PREFIX ]; then
6767
EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR}
6868
# path to where accel/* subdirectory is located
6969
EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR}
70-
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
71-
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}"
70+
if [ ! -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then
71+
# We should try to use the fallback compute capability
72+
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR::-1}0"
73+
show_msg "archdetect found no supported accelerator ${EESSI_ACCEL_SUBDIR}, falling back to ${EESSI_ACCELERATOR_TARGET}"
7274
else
73-
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})"
75+
EESSI_ACCELERATOR_TARGET="${EESSI_ACCEL_SUBDIR}"
76+
fi
77+
if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCELERATOR_TARGET} ]; then
78+
show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCELERATOR_TARGET}"
79+
export EESSI_ACCELERATOR_TARGET
80+
else
81+
show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCELERATOR_TARGET})"
7482
fi
7583
fi
7684
else
@@ -95,6 +103,7 @@ if [ -d $EESSI_PREFIX ]; then
95103
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
96104
if [ -f $lmod_rc_file ]; then
97105
show_msg "Found Lmod configuration file at $lmod_rc_file"
106+
export LMOD_RC="$lmod_rc_file"
98107
else
99108
error "Lmod configuration file not found at $lmod_rc_file"
100109
fi
@@ -112,6 +121,8 @@ if [ -d $EESSI_PREFIX ]; then
112121
elif [ -d $EESSI_SOFTWARE_PATH ]; then
113122
export EESSI_SITE_SOFTWARE_PATH=${EESSI_SOFTWARE_PATH/versions/host_injections}
114123
show_msg "Using ${EESSI_SITE_SOFTWARE_PATH} as the site extension directory for installations."
124+
EESSI_SITE_ACCEL_SOFTWARE_PATH=${EESSI_ACCEL_SOFTWARE_PATH/versions/host_injections}
125+
show_msg "Using ${EESSI_SITE_ACCEL_SOFTWARE_PATH} as the site extension directory for accelerated installations."
115126
# Allow for use of alternative module tree shipped with EESSI
116127
if [ -z ${EESSI_MODULE_SUBDIR+x} ]; then
117128
# EESSI_MODULE_SUBDIR not set
@@ -137,9 +148,11 @@ if [ -d $EESSI_PREFIX ]; then
137148
false
138149
fi
139150

140-
if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} ]; then
141-
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}
151+
if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR} ]; then
152+
export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
142153
show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH."
154+
export EESSI_SITE_MODULEPATH_ACCEL=${EESSI_SITE_ACCEL_SOFTWARE_PATH}/${EESSI_ACCELERATOR_TARGET}/${EESSI_MODULE_SUBDIR}
155+
show_msg "Using ${EESSI_SITE_MODULEPATH_ACCEL} as additional site extension directory (for accelerators) to be added to MODULEPATH."
143156
fi
144157

145158
# Fix wrong path for RHEL >=8 libcurl

init/modules/EESSI/2023.06.lua

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,14 @@ function archdetect_accel()
6363
local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
6464
-- for unload mode, we need to grab the value before it is unset
6565
local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
66-
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE ") then
66+
if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
6767
if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
6868
LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
6969
end
70+
-- this script sets EESSI_ACCEL_SUBDIR
7071
source_sh("bash", script)
72+
else
73+
setenv("EESSI_ACCEL_SUBDIR", os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE"))
7174
end
7275
archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
7376
eessiDebug("Got archdetect accel option: " .. archdetect_accel)
@@ -140,16 +143,33 @@ if not (archdetect_accel == nil or archdetect_accel == '') then
140143
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
141144
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
142145
eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
146+
if not isDir(eessi_module_path_accel) then
147+
-- fall back to major version GPU arch if the exact one is not an option (e.g., 7.5 -> 7.0)
148+
local original_archdetect_accel = archdetect_accel
149+
archdetect_accel = archdetect_accel:sub(1,-2) .. "0"
150+
eessiDebug("No directory for " .. original_archdetect_accel .. ", trying " .. archdetect_accel)
151+
eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
152+
end
143153
if isDir(eessi_module_path_accel) then
154+
-- set the accelerator target based on what actually exists
155+
setenv("EESSI_ACCELERATOR_TARGET", archdetect_accel)
144156
setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
145-
prepend_path("MODULEPATH", eessi_module_path_accel)
146-
eessiDebug("Using acclerator modules at: " .. eessi_module_path_accel)
157+
if ( mode() ~= "spider" ) then
158+
prepend_path("MODULEPATH", eessi_module_path_accel)
159+
eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
160+
end
147161
end
148162
end
149163

150164
-- prepend the site module path last so it has priority
151165
prepend_path("MODULEPATH", eessi_site_module_path)
152166
eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
167+
if isDir(eessi_module_path_accel) then
168+
eessi_module_path_site_accel = string.gsub(eessi_module_path_accel, "versions", "host_injections")
169+
setenv("EESSI_SITE_MODULEPATH_ACCEL", eessi_module_path_site_accel)
170+
prepend_path("MODULEPATH", eessi_module_path_site_accel)
171+
eessiDebug("Using site accelerator modules at: " .. eessi_module_path_site_accel)
172+
end
153173
if mode() == "load" then
154174
LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
155175
end

0 commit comments

Comments
 (0)