Skip to content

Commit 99c82b5

Browse files
authored
Merge pull request #59 from casparvl/cuda_in_major_arch_only
Adapt subdir for CUDA toolkit in host injections
2 parents f50b063 + 7a1e4c1 commit 99c82b5

File tree

7 files changed

+111
-224
lines changed

7 files changed

+111
-224
lines changed

EESSI-install-software.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,29 @@ else
150150
# make sure the software and modules directories exist
151151
# (since it's expected by init/eessi_environment_variables when using archdetect and by the EESSI module)
152152
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/{modules,software}
153+
154+
# If EESSI_ACCELERATOR_TARGET_OVERRIDE is defined, we are building for an accelerator target
155+
# In that case, make sure the modulepath for the accelerator subdir exists, otherwise the EESSI module will not
156+
# set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET
157+
# is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail
158+
# See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882
159+
# Only act when we are really building for an accelerator target, i.e. when
# EESSI_ACCELERATOR_TARGET_OVERRIDE is set to a non-empty value.
# The expansions below are quoted on purpose: with an unset or empty variable an unquoted
# `[ -n $VAR ]` collapses to the one-argument test `[ -n ]`, which is always true, so this
# branch would also run (and could even hit the fatal_error below) for CPU-only builds.
if [ -n "${EESSI_ACCELERATOR_TARGET_OVERRIDE}" ]; then
    # Note that ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all
    # is only the correct path if EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE is not set
    if [ -z "${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE}" ]; then
        # Create the accelerator modulepath so that it exists before it is needed
        mkdir -p "${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all"
    else
        # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use
        # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree.
        # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different
        # from what the code will be optimized for, and we wouldn't want that.
        # So this message _should_ never be printed...
        msg="When building the software subdirectory for the CPU should almost certainly be that of the host."
        msg="$msg If you think this is incorrect, please implement behaviour that makes sense in "
        msg="$msg EESSI-install-software.sh, essentially replacing this error."
        fatal_error "$msg"
    fi
fi
153176
)
154177
fi
155178

create_lmodsitepackage.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,31 @@
123123
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n"
124124
if packagesList[simpleName] then
125125
-- simpleName is a module in packagesList
126-
-- get the full host_injections path
127-
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
126+
-- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59
127+
-- If that exists, print a more targeted, explanatory warning
128+
local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
129+
local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. "/easybuild"
130+
local previousPackageDirExists = isDir(previousPackageEasyBuildDir)
131+
132+
-- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end
133+
local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/"
134+
strip_suffix = strip_suffix .. os.getenv('EESSI_SOFTWARE_SUBDIR')
135+
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY'))
128136
129137
-- build final path where the software should be installed
130138
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
131139
local packageDirExists = isDir(packageEasyBuildDir)
132-
if not packageDirExists then
140+
if previousPackageDirExists and not packageDirExists then
141+
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
142+
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
143+
advice = advice .. "can find it.\\n"
144+
advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". "
145+
advice = advice .. "However, EESSI expects it in a different location since Aug'25, namely at "
146+
advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". "
147+
advice = advice .. "Please re-install the package at the new location. "
148+
advice = advice .. refer_to_docs
149+
LmodError("\\nYou requested to load ", simpleName, " ", advice)
150+
elseif not packageDirExists then
133151
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
134152
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
135153
advice = advice .. "can find it.\\n"
@@ -293,7 +311,7 @@ def error(msg):
293311
# the install path (if it exists)
294312
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
295313
if accel_subdir:
296-
sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '')
314+
sitepackage_path = sitepackage_path.replace("/%s" % accel_subdir, '')
297315
try:
298316
os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True)
299317
with open(sitepackage_path, 'w') as fp:

easystacks/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
WARNING: in principle _all_ easystack files should go into EESSI/software-layer, not in EESSI/software-layer-scripts. Easystack files are only added in EESSI/software-layer-scripts by exception, for example when the (re)deployment of the software has to be done synchronously with a change in EESSI/software-layer-scripts.
2+
3+
Here, we list past deployments for which this was the case (and why):
4+
5+
[PR#59](https://github.com/EESSI/software-layer-scripts/pull/59): modified the prefix in which `install_cuda_and_libraries.sh` installs the CUDA toolkit within `host_injections`. Also, updated the Lmod SitePackage.lua to print an informative message in case the CUDA Toolkit is found in the old location. This requires synchronous deployment of new CUDA and cuDNN installations in the software layer, because the symlinks from these installations should be redirected to the new prefix in `host_injections`.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# In https://github.com/EESSI/software-layer-scripts/pull/59 we introduced a new location for
2+
# installing the CUDA toolkit within the host_injections directory. This requires reinstallation
3+
# of CUDA and cuDNN to make sure all symlinks point to these new locations
4+
easyconfigs:
5+
- CUDA-12.1.1.eb:
6+
options:
7+
accept-eula-for: CUDA
8+
- CUDA-12.4.0.eb:
9+
options:
10+
accept-eula-for: CUDA
11+
- cuDNN-8.9.2.26-CUDA-12.1.1.eb:
12+
options:
13+
accept-eula-for: cuDNN

eb_hooks.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def parse_list_of_dicts_env(var_name):
151151
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', var_name):
152152
raise ValueError(f"Invalid environment variable name: {var_name}")
153153
list_string = os.getenv(var_name, '[]')
154-
154+
155155
list_of_dicts = []
156156
try:
157157
# Try JSON format first
@@ -162,7 +162,7 @@ def parse_list_of_dicts_env(var_name):
162162
list_of_dicts = ast.literal_eval(list_string)
163163
except (ValueError, SyntaxError):
164164
raise ValueError(f"Environment variable '{var_name}' does not contain a valid list of dictionaries.")
165-
165+
166166
return list_of_dicts
167167

168168

@@ -211,7 +211,7 @@ def post_ready_hook(self, *args, **kwargs):
211211
parallel = self.parallel
212212
else:
213213
parallel = self.cfg['parallel']
214-
214+
215215
if parallel == 1:
216216
return # no need to limit if already using 1 core
217217

@@ -733,7 +733,7 @@ def pre_configure_hook_score_p(self, *args, **kwargs):
733733
def pre_configure_hook_vsearch(self, *args, **kwargs):
734734
"""
735735
Pre-configure hook for VSEARCH
736-
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
736+
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
737737
(solves "expected initializer before 'OF'" errors)
738738
"""
739739
if self.name == 'VSEARCH':
@@ -1199,7 +1199,7 @@ def post_postproc_cuda(self, *args, **kwargs):
11991199

12001200
# replace files that are not distributable with symlinks into
12011201
# host_injections
1202-
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
1202+
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
12031203
else:
12041204
print_msg(f"EESSI hook to respect CUDA license not triggered for installation path {self.installdir}")
12051205
else:
@@ -1249,16 +1249,19 @@ def post_postproc_cudnn(self, *args, **kwargs):
12491249

12501250
# replace files that are not distributable with symlinks into
12511251
# host_injections
1252-
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
1252+
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
12531253
else:
12541254
print_msg(f"EESSI hook to respect cuDDN license not triggered for installation path {self.installdir}")
12551255
else:
12561256
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")
12571257

12581258

1259-
def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
1259+
def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
12601260
"""
12611261
Replace files that cannot be distributed with symlinks into host_injections
1262+
Since these are binary files, only the CPU family will be included in the prefix,
1263+
no microarchitecture or accelerator architecture will be included. For example,
1264+
/cvmfs/software.eessi.io/host_injections/x86_64/suffix/to/actual/file
12621265
"""
12631266
# Different packages use different ways to specify which files or file
12641267
# 'types' may be redistributed. For CUDA, the 'EULA.txt' lists full file
@@ -1301,13 +1304,37 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al
13011304
log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
13021305
print_name, full_path)
13031306
# the host_injections path is under a fixed repo/location for CUDA or cuDNN
1307+
# full_path is something similar to
1308+
# /cvmfs/software.eessi.io/versions/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
1309+
# host_inj_path will then be
1310+
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
13041311
host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path)
13051312
# CUDA and cu* libraries themselves don't care about compute capability so remove this
13061313
# duplication from under host_injections (symlink to a single CUDA or cu* library
13071314
# installation for all compute capabilities)
13081315
accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET")
1316+
# If accel_subdir is defined, remove it from the full path
1317+
# After removal of accel_subdir, host_inj_path will be something like
1318+
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc
13091319
if accel_subdir:
1310-
host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '')
1320+
host_inj_path = host_inj_path.replace(accel_subdir, '')
1321+
software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR")
1322+
cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY")
1323+
os_type = get_eessi_envvar("EESSI_OS_TYPE")
1324+
eessi_version = get_eessi_envvar("EESSI_VERSION")
1325+
if software_subdir and cpu_family and os_type and eessi_version:
1326+
# Compose the string to be removed:
1327+
partial_path = f"{eessi_version}/software/{os_type}/{software_subdir}"
1328+
# After this, host_inj_path will be e.g.
1329+
# /cvmfs/software.eessi.io/host_injections/x86_64/software/CUDA/bin/nvcc
1330+
host_inj_path = host_inj_path.replace(partial_path, cpu_family)
1331+
else:
1332+
msg = "Failed to construct path to symlink for file (%s). All of the following values "
1333+
msg += "have to be defined: EESSI_SOFTWARE_SUBDIR='%s', EESSI_CPU_FAMILY='%s', "
1334+
msg += "EESSI_OS_TYPE='%s', EESSI_VERSION='%s'. Failed to replace non-redistributable file "
1335+
msg += "with symlink, aborting..."
1336+
raise EasyBuildError(msg, full_path, software_subdir, cpu_family, os_type, eessi_version)
1337+
13111338
# make sure source and target of symlink are not the same
13121339
if full_path == host_inj_path:
13131340
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "

scripts/gpu_support/nvidia/install_cuda_and_libraries.sh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,16 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
132132

133133
# If there is a GPU on the node, the installation path will by default have an
134134
# accelerator subdirectory. For CUDA and cu*, these are binary installations and
135-
# don't care about the target compute capability. Our hooks are aware of this and
136-
# therefore expect CUDA to be available under EESSI_SITE_SOFTWARE_PATH
137-
export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH
135+
# we don't care about the target compute capability nor the CPU microarchitecture.
136+
# Our hooks are aware of this and therefore expect CUDA to be available under
137+
# something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture
138+
# stripped
139+
# This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until
140+
# the EESSI_VERSION in a capture group. It will then replace that with the content
141+
# of the capture group and then have the EESSI_CPU_FAMILY appended
142+
# Thus EESSI_SITE_CPU_FAMILY_PATH is then something like /cvmfs/software.eessi.io/host_injections/x86_64
143+
EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_VERSION"/software/"$EESSI_OS_TYPE"/"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|')
144+
export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH
138145

139146
# Install modules in hidden .modules dir to keep track of what was installed before
140147
# (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild
@@ -258,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
258265
cp -a ${eb_last_log} .
259266
fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..."
260267
else
261-
echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!"
268+
echo_green "all installations at ${EASYBUILD_INSTALLPATH}/software/... succeeded!"
262269
fi
263270

264271
# clean up tmpdir content

0 commit comments

Comments
 (0)