diff --git a/.github/workflows/scripts/compare_to_generic.sh b/.github/workflows/scripts/compare_to_generic.sh
index 59a1397ec5..8f046ec9d4 100755
--- a/.github/workflows/scripts/compare_to_generic.sh
+++ b/.github/workflows/scripts/compare_to_generic.sh
@@ -23,4 +23,35 @@ esac
 source_of_truth_modules="$base_dir/$source_of_truth/$modules_subdir"
 arch_modules="$base_dir/$target_arch/$modules_subdir"
 echo "Comparing $arch_modules to $source_of_truth_modules"
-python3 $script_dir/compare_stacks.py $source_of_truth_modules $arch_modules
+
+# Compare the CPU stacks first; a non-zero exit from compare_stacks.py aborts the script
+if ! python3 "$script_dir/compare_stacks.py" "$source_of_truth_modules" "$arch_modules"; then
+    echo "Warning: Comparison failed for CPU stacks" >&2
+    exit 1
+fi
+
+# Also compare NVIDIA GPU software stacks
+# (the ':-' default makes an unset variable behave like an empty one, also under 'set -u')
+if [[ -n "${CUDA_COMPUTE_CAPABILITIES:-}" ]]; then
+    # Split the whitespace-separated list into an array
+    read -ra compute_capabilities <<< "$CUDA_COMPUTE_CAPABILITIES"
+    echo "Also comparing CUDA-enabled software stacks (for compute capabilities: ${compute_capabilities[*]})"
+    # The reference is always the cc80 stack of the source of truth, whichever $cc is being checked
+    source_of_truth_modules="$base_dir/$source_of_truth/accel/nvidia/cc80/$modules_subdir"
+    # Track failures instead of exiting early, so every compute capability gets compared
+    any_failure=0
+    for cc in "${compute_capabilities[@]}"; do
+        arch_modules="$base_dir/$target_arch/accel/nvidia/$cc/$modules_subdir"
+        echo "Comparing $arch_modules to $source_of_truth_modules"
+        if ! python3 "$script_dir/compare_stacks.py" "$source_of_truth_modules" "$arch_modules"; then
+            echo "Warning: Comparison failed for compute capability $cc" >&2
+            any_failure=1
+        fi
+    done
+    if [[ $any_failure -ne 0 ]]; then
+        echo "One or more CUDA software stack comparisons failed." >&2
+        exit 1
+    fi
+else
+    echo "CUDA_COMPUTE_CAPABILITIES is not set or is empty, not checking NVIDIA software stacks"
+fi
diff --git a/.github/workflows/test_compare_stacks.yml b/.github/workflows/test_compare_stacks.yml
index 00c4af099a..f03929349b 100644
--- a/.github/workflows/test_compare_stacks.yml
+++ b/.github/workflows/test_compare_stacks.yml
@@ -8,11 +8,17 @@ on:
 permissions:
   contents: read # to fetch code (actions/checkout)
 env:
-  EESSI_ACCELERATOR_TARGETS: |
+  CUDA_COMPUTE_CAPABILITIES_YAML: |
+    # Provide a default set of compute capabilities
+    default:
+      - cc70
+      - cc80
+      - cc90
+    # and then allow for special cases for specific architectures
     x86_64/amd/zen2:
-      - nvidia/cc80
-    x86_64/amd/zen3:
-      - nvidia/cc80
+      - cc70
+      - cc80
+      - cc90
 jobs:
   compare_stacks:
     runs-on: ubuntu-24.04
@@ -53,4 +59,11 @@ jobs:
 
           # Compare the requested architecture to the generic stack
           # (assumes the general structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/modules/all)
+          # and include a check for CUDA-enabled software using the environment variable CUDA_COMPUTE_CAPABILITIES
+          # (which assumes the structure /cvmfs/software.eessi.io/versions/2023.06/software/linux/$COMPARISON_ARCH/accel/nvidia/$cc/modules/all)
+
+          # Parse the yaml that makes the compute capabilities arch-dependent
+          CUDA_COMPUTE_CAPABILITIES=$(echo "${CUDA_COMPUTE_CAPABILITIES_YAML}" | yq ".\"${{matrix.COMPARISON_ARCH}}\" // .default | .[]" | tr '\n' ' ')
+          export CUDA_COMPUTE_CAPABILITIES=${CUDA_COMPUTE_CAPABILITIES%% } # trim trailing space
+
           .github/workflows/scripts/compare_to_generic.sh ${EESSI_PREFIX}/software/${EESSI_OS_TYPE} ${{matrix.COMPARISON_ARCH}}
diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250926-eb-5.1.1-rebuild-lightGBM-for-cuda-sanity-check.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250926-eb-5.1.1-rebuild-lightGBM-for-cuda-sanity-check.yml
new file mode 100644
index 0000000000..8d75ed7190
--- /dev/null
+++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250926-eb-5.1.1-rebuild-lightGBM-for-cuda-sanity-check.yml
@@ -0,0 +1,12 @@
+# We'll rebuild all CUDA software, for various reasons
+# 1. We now have a proper CUDA sanity check, and if anything was 'wrong' with our current CUDA installs, we'd like
+#    to know about it
+# 2. The PR implementing a CI to check for differences between officially supported CUDA Compute Capabilities shows
+#    that there are a lot of missing installations https://github.com/EESSI/software-layer/pull/1087 . A rebuild PR like
+#    this will have the convenient side effect of filling all those holes
+easyconfigs:
+  - LightGBM-4.5.0-foss-2023a-CUDA-12.1.1.eb:
+      options:
+        # see https://github.com/easybuilders/easybuild-easyconfigs/pull/24023
+        from-commit: 853cdf7a8a3912aa0e55367b2b4451ebff00e13b
+        cuda-sanity-check-accept-missing-ptx: True