File tree Expand file tree Collapse file tree 1 file changed +21
-0
lines changed Expand file tree Collapse file tree 1 file changed +21
-0
lines changed Original file line number Diff line number Diff line change 9494 end
9595end
9696
-- Compute the value $OMPI_MCA_btl needs so that the smcuda BTL is never
-- selected, starting from its current value (nil when the variable is unset).
--
-- Open MPI only honours '^' as the very first character of the list, where it
-- turns the whole list into an exclusion list; inclusive and exclusive entries
-- cannot be mixed. Hence:
--   * unset/empty value      -> "^smcuda"
--   * exclusion list "^a,b"  -> "^a,b,smcuda" (smcuda listed exactly once)
--   * inclusion list "a,b"   -> same list with any smcuda entry removed
--                               ("^smcuda" if nothing else is left)
-- The result is idempotent, so loading the module repeatedly is harmless.
local function openmpi_btl_without_smcuda(current)
    if current == nil or current == "" then
        return "^smcuda"
    end

    local negated = string.sub(current, 1, 1) == "^"
    local kept = {}
    for entry in string.gmatch(current, "[^,]+") do
        -- drop blanks and any '^' (leading one, or stray ones left behind by
        -- earlier versions of this hook that appended ",^smcuda")
        local btl = string.gsub(entry, "[ ^]", "")
        if btl ~= "" and btl ~= "smcuda" then
            kept[#kept + 1] = btl
        end
    end

    if negated then
        kept[#kept + 1] = "smcuda"
        return "^" .. table.concat(kept, ",")
    end
    if #kept == 0 then
        -- the inclusion list only named smcuda; fall back to excluding it
        return "^smcuda"
    end
    return table.concat(kept, ",")
end

-- Lmod load hook: disable the smcuda BTL when an OpenMPI module is loaded on
-- aarch64/neoverse_v1, to work around a hang/crash due to a bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
--
-- t: table passed by Lmod to load hooks; only t.modFullName is used.
-- Does nothing for other modules or other values of $EESSI_SOFTWARE_SUBDIR.
local function openmpi_load_hook(t)
    -- part of the full module name before the first '/', nil if there is none
    local moduleName = string.match(t.modFullName or "", "(.-)/")
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if moduleName ~= "OpenMPI" or cpuTarget ~= "aarch64/neoverse_v1" then
        return
    end

    local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
    LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
    setenv("OMPI_MCA_btl", openmpi_btl_without_smcuda(os.getenv("OMPI_MCA_btl")))
end
116+
-- By default Lmod keeps a single function per hook name, so a second
-- hook.register("load", ...) call would replace the first one and the CUDA
-- hook would never run. Register one wrapper that runs both load hooks.
local function load_hook(t)
    cuda_enabled_load_hook(t)
    openmpi_load_hook(t)
end

hook.register("load", load_hook)
98119"""
99120
100121def error (msg ):
You can’t perform that action at this time.
0 commit comments