-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Headers][X86] Allow FMA3/FMA4 vector intrinsics to be used in constexpr #154558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Now that llvm#152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr.
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr. Fixes #154555. Patch is 31.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154558.diff 4 Files Affected:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index 69977fb2ccee3..05261f6389274 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -20,14 +20,22 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -46,14 +54,14 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -72,14 +80,14 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -98,14 +106,14 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -148,56 +156,56 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -230,5 +238,7 @@ _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif /* __FMA4INTRIN_H */
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index 24584a928da4d..d8ea489022b8f 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -18,6 +18,14 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
@@ -32,7 +40,7 @@
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
@@ -53,7 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -132,7 +140,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
@@ -153,7 +161,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -232,7 +240,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
@@ -253,7 +261,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -332,7 +340,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
@@ -353,7 +361,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -536,7 +544,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
@@ -557,7 +565,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
@@ -578,7 +586,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
@@ -599,7 +607,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
@@ -620,7 +628,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
@@ -641,7 +649,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -662,7 +670,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
@@ -683,7 +691,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -808,5 +816,7 @@ _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif /* __FMAINTRIN_H */
diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
index aa17dcc62fbc0..8e9822ec6ad2f 100644
--- a/clang/test/CodeGen/X86/fma-builtins.c
+++ b/clang/test/CodeGen/X86/fma-builtins.c
@@ -5,18 +5,21 @@
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ps
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), -0.0f, 0.0f, -2.0f, -3.0f));
__m128d test_mm_fmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmadd_pd
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), -0.0, -3.0));
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ss
@@ -44,6 +47,7 @@ __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 8.0f, -6.0f, -5.0f));
__m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsub_pd
@@ -51,6 +55,7 @@ __m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmsub_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, -5.0));
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ss
@@ -80,6 +85,7 @@ __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, -8.0f, 6.0f, 5.0f));
__m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmadd_pd
@@ -87,6 +93,7 @@ __m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 5.0));
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ss
@@ -117,6 +124,7 @@ __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 0.0f, 2.0f, 3.0f));
__m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmsub_pd
@@ -125,6 +133,7 @@ __m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmsub_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 3.0));
__m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ss
@@ -183,12 +192,14 @@ __m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m256(_mm256_fmadd_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), -0.0f, 60.0f, -62.0f, -63.0f, +56.0f, +48.0f, -32.0f, 0.0f));
__m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmadd_pd
// CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m256d(_mm256_fmadd_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), -0.0, 0.0, -2.0, -3.0));
__m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmsub_ps
@@ -196,6 +207,7 @@ __m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m256(_mm256_fmsub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, 68.0f, -66.0f, -65.0f, 72.0f, 80.0f, -96.0f, -128.0f));
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmsub_pd
@@ -203,6 +215,7 @@ __m256d test_mm2...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions h,c -- clang/lib/Headers/fma4intrin.h clang/lib/Headers/fmaintrin.h clang/test/CodeGen/X86/fma-builtins.c clang/test/CodeGen/X86/fma4-builtins.cView the diff from clang-format here.diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index d8ea48902..2d08c179b 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -41,8 +41,7 @@
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
@@ -62,8 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
@@ -141,8 +139,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
@@ -162,8 +159,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
@@ -241,8 +237,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
@@ -262,8 +257,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
@@ -341,8 +335,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
@@ -362,8 +355,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
@@ -545,8 +537,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
@@ -566,8 +557,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
@@ -587,8 +577,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
@@ -608,8 +597,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
@@ -629,8 +617,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
@@ -650,8 +637,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
@@ -671,8 +657,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
@@ -692,8 +677,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
|
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr.
Fixes #154555.