|
35 | 35 | local simpleName = string.match(t.modFullName, "(.-)/") |
36 | 36 | -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. |
37 | 37 | -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse |
38 | | - -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI |
| 38 | + -- to load the CUDA module and print an informative message on how to set up GPU support for NESSI |
39 | 39 | local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" |
40 | 40 | if simpleName == 'CUDA' then |
41 | 41 | -- get the full host_injections path |
|
44 | 44 | local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" |
45 | 45 | local cudaDirExists = isDir(cudaEasyBuildDir) |
46 | 46 | if not cudaDirExists then |
47 | | - local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " |
48 | | - advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI " |
| 47 | + local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI " |
| 48 | + advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI " |
49 | 49 | advice = advice .. "can find it.\\n" |
50 | 50 | advice = advice .. refer_to_docs |
51 | 51 | LmodError("\\nYou requested to load ", simpleName, " ", advice) |
52 | 52 | end |
53 | 53 | end |
54 | | - -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, |
| 54 | + -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker, |
55 | 55 | -- otherwise, refuse to load the requested module and print error message |
56 | 56 | local haveGpu = mt:haveProperty(simpleName,"arch","gpu") |
57 | 57 | if haveGpu then |
58 | 58 | local arch = os.getenv("EESSI_CPU_FAMILY") or "" |
59 | | - local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" |
60 | | - local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" |
| 59 | + local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" |
| 60 | + local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" |
61 | 61 | local cudaDriverExists = isFile(cudaDriverFile) |
62 | 62 | local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") |
63 | 63 | if not (cudaDriverExists or singularityCudaExists) then |
64 | 64 | local advice = "which relies on the CUDA runtime environment and driver libraries. " |
65 | 65 | advice = advice .. "In order to be able to use the module, you will need " |
66 | | - advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n" |
| 66 | + advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n" |
67 | 67 | advice = advice .. refer_to_docs |
68 | 68 | LmodError("\\nYou requested to load ", simpleName, " ", advice) |
69 | 69 | else |
|
85 | 85 | if driver_libs_need_update == true then |
86 | 86 | local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " |
87 | 87 | advice = advice .. "Please update your CUDA driver libraries and then " |
88 | | - advice = advice .. "let EESSI know about the update.\\n" |
| 88 | + advice = advice .. "let NESSI know about the update.\\n" |
89 | 89 | advice = advice .. refer_to_docs |
90 | 90 | LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) |
91 | 91 | end |
|
94 | 94 | end |
95 | 95 | end |
96 | 96 |
|
local function openmpi_load_hook(t)
    -- Disable the smcuda BTL when an OpenMPI module is loaded on aarch64/neoverse_v1,
    -- to work around a hang/crash due to a bug in OpenMPI;
    -- see https://gitlab.com/eessi/support/-/issues/41
    --
    -- t: table handed to "load" hooks by Lmod; only t.modFullName is read here.
    --
    -- $OMPI_MCA_btl holds either an inclusion list ("a,b") or an exclusion list ("^a,b"),
    -- where one leading '^' negates the whole list. The two forms cannot be mixed, so an
    -- existing value is extended according to its form instead of blindly appending
    -- ",^smcuda" to it. The variable is only touched when its value actually changes,
    -- which keeps repeated loads from growing the list.
    local moduleName = string.match(t.modFullName, "(.-)/")
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if (moduleName ~= "OpenMPI") or (cpuTarget ~= "aarch64/neoverse_v1") then
        return
    end

    -- an unset variable is handled like an empty one
    local ompiMcaBtl = os.getenv("OMPI_MCA_btl") or ""
    local isExclusion = (string.sub(ompiMcaBtl, 1, 1) == "^")
    local listed = ompiMcaBtl
    if isExclusion then
        listed = string.sub(ompiMcaBtl, 2)
    end

    -- walk the comma-separated components: drop empty entries, note whether smcuda
    -- is named, and collect every other component in its original order
    local others = ""
    local hasSmcuda = false
    for component in string.gmatch(listed, "[^,]+") do
        if component == "smcuda" then
            hasSmcuda = true
        elseif others == "" then
            others = component
        else
            others = others .. "," .. component
        end
    end

    local newValue
    if others == "" then
        -- unset, empty, or naming nothing but smcuda: exclude just smcuda
        newValue = "^smcuda"
    elseif not isExclusion then
        -- inclusion list: smcuda is disabled by not listing it
        newValue = others
    elseif hasSmcuda then
        -- exclusion list that already covers smcuda: leave it untouched
        newValue = ompiMcaBtl
    else
        -- exclusion list: the leading '^' already applies to every entry
        newValue = "^" .. others .. ",smcuda"
    end

    if newValue ~= ompiMcaBtl then
        local msg = "Excluding 'smcuda' via $OMPI_MCA_btl to work around bug in OpenMPI"
        LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
        setenv("OMPI_MCA_btl", newValue)
    end
end
| 116 | +
|
-- Lmod keeps one function per hook name unless told otherwise, so a second
-- hook.register("load", ...) call replaces the first one and the CUDA checks
-- would silently stop running. Register a single wrapper that runs both hooks,
-- CUDA checks first since they may abort the load with LmodError.
local function combined_load_hook(t)
    cuda_enabled_load_hook(t)
    openmpi_load_hook(t)
end

hook.register("load", combined_load_hook)
98 | 119 | """ |
99 | 120 |
|
100 | 121 | def error(msg): |
|
0 commit comments