Skip to content

Commit 0d03852

Browse files
committed
add Lmod hook to set $OMPI_MCA_btl to '^smcuda' when loading OpenMPI module
1 parent 1045ef0 commit 0d03852

File tree

1 file changed

+21
-0
lines changed

1 file changed

+21
-0
lines changed

create_lmodrc.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,28 @@
9494
end
9595
end
9696
97+
-- Compute the value $OMPI_MCA_btl must have so that the smcuda BTL is not used.
--
-- Open MPI only recognises '^' as the very first character of a component list,
-- where it turns the whole list into an exclusion list; an inclusion list and an
-- exclusion list can not be mixed. Blindly appending ',^smcuda' is therefore
-- wrong for any pre-existing value, and keeps growing the value on every re-load.
--
-- @param current  current value of $OMPI_MCA_btl (nil when the variable is unset)
-- @return         value to use; identical to 'current' when nothing has to change
local function exclude_smcuda_btl(current)
    if current == nil or current == "" then
        return "^smcuda"
    end

    local negated = string.sub(current, 1, 1) == "^"
    local list = current
    if negated then
        list = string.sub(current, 2)
    end

    -- split the comma-separated list, remembering whether smcuda is mentioned
    local others = {}
    local hasSmcuda = false
    for name in string.gmatch(list, "[^,]+") do
        if name == "smcuda" then
            hasSmcuda = true
        else
            others[#others + 1] = name
        end
    end

    if negated then
        -- exclusion list: extend it, unless smcuda is already excluded
        if hasSmcuda then
            return current
        end
        others[#others + 1] = "smcuda"
        return "^" .. table.concat(others, ",")
    end

    -- inclusion list: smcuda is only active when it is explicitly listed
    if not hasSmcuda then
        return current
    end
    if #others == 0 then
        -- smcuda was the only selected component; fall back to excluding it
        return "^smcuda"
    end
    return table.concat(others, ",")
end

-- Load hook: disable the smcuda BTL when loading an OpenMPI module for
-- aarch64/neoverse_v1, to work around hang/crash due to bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
--
-- @param t  table passed by the hook mechanism; only t.modFullName is read
--
-- $OMPI_MCA_btl is only touched (and the message only printed) when its value
-- actually has to change, so loading the module repeatedly is idempotent.
local function openmpi_load_hook(t)
    -- module name is the part of the full name in front of the first '/'
    local moduleName = string.match(t.modFullName, "(.-)/")
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
        local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
        local newValue = exclude_smcuda_btl(ompiMcaBtl)
        if newValue ~= ompiMcaBtl then
            local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
            LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
            setenv("OMPI_MCA_btl", newValue)
        end
    end
end
116+
97117
-- register cuda_enabled_load_hook as a handler for the "load" hook
hook.register("load", cuda_enabled_load_hook)
118+
-- register openmpi_load_hook as a handler for the "load" hook as well
-- NOTE(review): assumes hook.register keeps earlier "load" handlers rather than replacing them - confirm for the Lmod version in use
hook.register("load", openmpi_load_hook)
98119
"""
99120

100121
def error(msg):

0 commit comments

Comments
 (0)