-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Headers][X86] Allow FMA3/FMA4 vector intrinsics to be used in constexpr #154558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Now that llvm#152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr.
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr. Fixes #154555. Patch is 31.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154558.diff 4 Files Affected:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index 69977fb2ccee3..05261f6389274 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -20,14 +20,22 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -46,14 +54,14 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -72,14 +80,14 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -98,14 +106,14 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -148,56 +156,56 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -230,5 +238,7 @@ _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif /* __FMA4INTRIN_H */
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index 24584a928da4d..d8ea489022b8f 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -18,6 +18,14 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
@@ -32,7 +40,7 @@
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
@@ -53,7 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -132,7 +140,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
@@ -153,7 +161,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
@@ -232,7 +240,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
@@ -253,7 +261,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -332,7 +340,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
@@ -353,7 +361,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
@@ -536,7 +544,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
@@ -557,7 +565,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
@@ -578,7 +586,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
@@ -599,7 +607,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
@@ -620,7 +628,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
@@ -641,7 +649,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -662,7 +670,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
@@ -683,7 +691,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
@@ -808,5 +816,7 @@ _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif /* __FMAINTRIN_H */
diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
index aa17dcc62fbc0..8e9822ec6ad2f 100644
--- a/clang/test/CodeGen/X86/fma-builtins.c
+++ b/clang/test/CodeGen/X86/fma-builtins.c
@@ -5,18 +5,21 @@
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ps
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), -0.0f, 0.0f, -2.0f, -3.0f));
__m128d test_mm_fmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmadd_pd
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), -0.0, -3.0));
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ss
@@ -44,6 +47,7 @@ __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 8.0f, -6.0f, -5.0f));
__m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsub_pd
@@ -51,6 +55,7 @@ __m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmsub_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, -5.0));
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ss
@@ -80,6 +85,7 @@ __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, -8.0f, 6.0f, 5.0f));
__m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmadd_pd
@@ -87,6 +93,7 @@ __m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 5.0));
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ss
@@ -117,6 +124,7 @@ __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 0.0f, 2.0f, 3.0f));
__m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmsub_pd
@@ -125,6 +133,7 @@ __m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmsub_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 3.0));
__m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ss
@@ -183,12 +192,14 @@ __m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmadd_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m256(_mm256_fmadd_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), -0.0f, 60.0f, -62.0f, -63.0f, +56.0f, +48.0f, -32.0f, 0.0f));
__m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmadd_pd
// CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fmadd_pd(a, b, c);
}
+TEST_CONSTEXPR(match_m256d(_mm256_fmadd_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), -0.0, 0.0, -2.0, -3.0));
__m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmsub_ps
@@ -196,6 +207,7 @@ __m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmsub_ps(a, b, c);
}
+TEST_CONSTEXPR(match_m256(_mm256_fmsub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, 68.0f, -66.0f, -65.0f, 72.0f, 80.0f, -96.0f, -128.0f));
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmsub_pd
@@ -203,6 +215,7 @@ __m256d test_mm2...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions h,c -- clang/lib/Headers/fma4intrin.h clang/lib/Headers/fmaintrin.h clang/test/CodeGen/X86/fma-builtins.c clang/test/CodeGen/X86/fma4-builtins.cView the diff from clang-format here.diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index d8ea48902..2d08c179b 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -41,8 +41,7 @@
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
@@ -62,8 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
@@ -141,8 +139,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
@@ -162,8 +159,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
@@ -241,8 +237,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
@@ -262,8 +257,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
@@ -341,8 +335,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
@@ -362,8 +355,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
@@ -545,8 +537,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
@@ -566,8 +557,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
@@ -587,8 +577,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
@@ -608,8 +597,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
@@ -629,8 +617,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
@@ -650,8 +637,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
@@ -671,8 +657,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
@@ -692,8 +677,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
|
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma constexpr.
Fixes #154555.