Skip to content
9 changes: 6 additions & 3 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ pr_diff=$(ls [0-9]*.diff | head -1)
# for now, this just reinstalls all scripts. Not the most elegant, but it works
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}

# Install full CUDA SDK in host_injections
# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
Expand All @@ -233,9 +233,12 @@ else
fi

if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \
-t /tmp/temp \
--accept-cuda-eula
else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
Expand Down
38 changes: 22 additions & 16 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,35 +107,41 @@
end


local function eessi_cuda_enabled_load_hook(t)
local function eessi_cuda_and_libraries_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
local packagesList = { ["CUDA"] = true, ["cuDNN"] = true }
-- If we try to load any of the modules in packagesList, we check if the
-- full package was installed on the host in host_injections.
-- This is required for end users to build additional software that depends
-- on the package. If the full SDK isn't present, refuse
-- to load the module and print an informative message on how to set up GPU support for EESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the CUDA software should be installed
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then

-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or ""
local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
Expand Down Expand Up @@ -175,10 +181,10 @@
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA hooks if the loaded module is in the EESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack
-- Only apply CUDA and cuDNN hooks if the loaded module is in the EESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cuda_and_libraries_enabled_load_hook(t)
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
157 changes: 117 additions & 40 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,47 @@ def post_sanitycheck_hook(self, *args, **kwargs):
POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs)


def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist):
    """
    Replace files that cannot be distributed with symlinks into host_injections

    Every regular file (symlinks are left alone) below install_dir is checked against the allowlist:
    - the file name stub (the part before the first '.') is always checked;
    - for extension-based packages, the first extension (e.g. '.so' for 'libfoo.so.8') is checked too.
    Files that do not match are removed and replaced by a symlink to the same path with the first
    'versions' path component substring replaced by 'host_injections'.

    :param log: logger object; only its debug() method is used
    :param install_dir: installation directory of the package, walked recursively
    :param package: name of the package, must be one of the known packages ('CUDA', 'cuDNN')
    :param allowlist: collection of file name stubs (and, for extension-based packages, extensions
                      including the leading dot) that may be kept
    :raises EasyBuildError: if the package is not known, or if the symlink target would be identical
                            to the file being replaced (i.e. the path does not contain 'versions')
    """
    # per package: are files (also) matched against the allowlist by their extension?
    extension_based = {"CUDA": False, "cuDNN": True}
    if package not in extension_based:
        raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package)
    match_on_extension = extension_based[package]

    # iterate over all files in the package installation directory
    for dir_path, _, files in os.walk(install_dir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(full_path):
                continue

            # split once: stub before the first dot, and (if any) the first extension
            name_parts = filename.split('.')
            basename = name_parts[0]
            extension = None
            if match_on_extension and len(name_parts) > 1:
                extension = '.' + name_parts[1]

            if basename in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
            elif extension is not None and extension in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
            else:
                print_name = filename if match_on_extension else basename
                log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                          print_name, full_path)
                # if it is not in the allowlist, delete the file and create a symlink to host_injections;
                # only the first 'versions' is replaced, so that files or subdirectories inside the
                # installation whose own name contains 'versions' keep their name in the symlink target
                host_inj_path = full_path.replace('versions', 'host_injections', 1)
                # make sure source and target of symlink are not the same
                if full_path == host_inj_path:
                    raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                         "are using this hook for an EESSI installation?",
                                         full_path, host_inj_path)
                remove_file(full_path)
                symlink(host_inj_path, full_path)


def post_sanitycheck_cuda(self, *args, **kwargs):
"""
Remove files from CUDA installation that we are not allowed to ship,
Expand Down Expand Up @@ -606,56 +647,91 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
if 'libcudart' not in allowlist:
raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist)

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file name stub is part of the allowlist
basename = filename.split('.')[0]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
basename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for an EESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")



def post_sanitycheck_cudnn(self, *args, **kwargs):
    """
    Strip the cuDNN installation of files that may not be shipped, leaving symlinks
    to the corresponding location under host_injections in their place.

    The set of file extensions that may be kept is derived from the "2. Distribution"
    line of the LICENSE file in the installation directory; LICENSE itself is always kept.
    """
    # guard: this hook only makes sense for cuDNN itself
    if self.name != 'cuDNN':
        raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")

    print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

    marker = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
    keep = {'LICENSE'}

    # scan the cuDNN LICENSE for the line listing what can be redistributed
    with open(os.path.join(self.installdir, 'LICENSE')) as license_file:
        for raw_line in license_file:
            if not raw_line.strip().startswith(marker):
                continue
            # everything after the marker: keep only the words that look like
            # an extension (leading dot), without any trailing dots
            for token in raw_line[len(marker):].split():
                if token.startswith('.'):
                    keep.add(token.rstrip('.'))

    allowlist = sorted(keep)
    self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

    # everything not covered by the allowlist becomes a symlink into host_injections
    replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)


def inject_gpu_property(ec):
"""
Add 'gpu' property, via modluafooter easyconfig parameter
Add 'gpu' property EESSI<PACKAGE>VERSION envvars and drop dependencies to
build dependencies, via modluafooter easyconfig parameter
"""
ec_dict = ec.asdict()
# Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property
if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]):
ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version")
# check if CUDA, cuDNN, you-name-it is in the dependencies, if so
# - drop dependency to build dependency
# - add 'gpu' Lmod property
# - add envvar with package version
packages_list = ( "CUDA", "cuDNN" )
packages_version = { }
add_gpu_property = ''

for package in packages_list:
# Check if package is in the dependencies, if so drop dependency to build
# dependency and set variable for later adding the 'gpu' Lmod property
if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]):
add_gpu_property = 'add_property("arch","gpu")'
for dep in iter(ec_dict['dependencies']):
if package in dep[0]:
# make package a build dependency only (rpathing saves us from link errors)
ec.log.info("Dropping dependency on %s to build dependency" % package)
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
# take note of version for creating the modluafooter
packages_version[package] = dep[1]
if add_gpu_property:
ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version")
key = 'modluafooter'
value = 'add_property("arch","gpu")'
cuda_version = 0
for dep in iter(ec_dict['dependencies']):
# Make CUDA a build dependency only (rpathing saves us from link errors)
if 'CUDA' in dep[0]:
cuda_version = dep[1]
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])
if key in ec_dict:
if not value in ec_dict[key]:
ec[key] = '\n'.join([ec_dict[key], value])
values = [add_gpu_property]
for package, version in packages_version.items():
envvar = "EESSI%sVERSION" % package.upper()
values.append('setenv("%s","%s")' % (envvar, version))
if not key in ec_dict:
ec[key] = '\n'.join(values)
else:
ec[key] = value
new_value = ec_dict[key]
for value in values:
if not value in new_value:
new_value = '\n'.join([new_value, value])
ec[key] = new_value

return ec


Expand Down Expand Up @@ -709,4 +785,5 @@ def inject_gpu_property(ec):

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cudnn,
}
5 changes: 4 additions & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh link_nvidia_host_libraries.sh
eessi-2023.06-cuda-and-libraries.yml
install_cuda_and_libraries.sh
install_cuda_host_injections.sh
link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
easyconfigs:
- CUDA-12.1.1.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
Loading