From b4104cd1021fc928a72235d965afeab1803b4b43 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 28 Aug 2025 16:57:35 +0200 Subject: [PATCH 1/2] Rebuild UCX UCC and OSU for the CUDA sanity check --- ...5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml new file mode 100644 index 0000000000..99529b551d --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml @@ -0,0 +1,10 @@ +# We'll rebuild all CUDA software, for various reasons +# 1. We now have a proper CUDA sanity check, and if anything was 'wrong' with our current CUDA installs, we'd like +# to know about it +# 2. The PR implementing a CI to check for differences between officially supported CUDA Compute Capabilities shows +# that there are a lot of missing installations https://github.com/EESSI/software-layer/pull/1087 . 
A rebuild PR like +# this will have the convenient side effect of filling all those holes +easyconfigs: + - UCX-CUDA-1.15.0-GCCcore-13.2.0-CUDA-12.4.0.eb + - UCC-CUDA-1.2.0-GCCcore-13.2.0-CUDA-12.4.0.eb + - OSU-Micro-Benchmarks-7.5-gompi-2023b-CUDA-12.4.0.eb From fca2471888e36b8468561b4da362d309633c4265 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 29 Aug 2025 21:25:02 +0200 Subject: [PATCH 2/2] Add NCCL to this PR, as it needs to be done after UCX but before UCC-CUDA --- ...0828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml index 99529b551d..4c567cbb62 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250828-eb-5.1.1-rebuild-UCX-UCC-OSU-for-cuda-sanity-check.yml @@ -6,5 +6,8 @@ # this will have the convenient side effect of filling all those holes easyconfigs: - UCX-CUDA-1.15.0-GCCcore-13.2.0-CUDA-12.4.0.eb + - NCCL-2.20.5-GCCcore-13.2.0-CUDA-12.4.0.eb: + options: + cuda-sanity-check-accept-missing-ptx: True - UCC-CUDA-1.2.0-GCCcore-13.2.0-CUDA-12.4.0.eb - OSU-Micro-Benchmarks-7.5-gompi-2023b-CUDA-12.4.0.eb