Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 42 additions & 48 deletions clang/lib/Headers/fma4intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,22 @@
/* Default attribute sets for the intrinsics in this header: always-inline,
 * nodebug, target feature "fma4", and a minimum vector width of 128 or 256
 * bits respectively. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
/* When compiled as C++11 or later, the *_CONSTEXPR attribute sets append
 * `constexpr` to the default attributes; in C and in earlier C++ they expand
 * to the plain default attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#else
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#endif

/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the first factor.
/// \param __B
///    A 128-bit vector of [4 x float] containing the second factor.
/// \param __C
///    A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
/// Computes a multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the first factor.
/// \param __B
///    A 128-bit vector of [2 x double] containing the second factor.
/// \param __C
///    A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
Expand All @@ -46,16 +52,14 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the first factor.
/// \param __B
///    A 128-bit vector of [4 x float] containing the second factor.
/// \param __C
///    A 128-bit vector of [4 x float] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the first factor.
/// \param __B
///    A 128-bit vector of [2 x double] containing the second factor.
/// \param __C
///    A 128-bit vector of [2 x double] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
Expand All @@ -72,16 +76,14 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 128-bit vector of [4 x float] containing the second factor.
/// \param __C
///    A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 128-bit vector of [2 x double] containing the second factor.
/// \param __C
///    A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
Expand All @@ -98,16 +100,14 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 128-bit vector of [4 x float] containing the second factor.
/// \param __C
///    A 128-bit vector of [4 x float] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 128-bit vector of [2 x double] containing the second factor.
/// \param __C
///    A 128-bit vector of [2 x double] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
Expand Down Expand Up @@ -148,58 +148,50 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
/// Computes a multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the first factor.
/// \param __B
///    A 256-bit vector of [8 x float] containing the second factor.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
/// Computes a multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the first factor.
/// \param __B
///    A 256-bit vector of [4 x double] containing the second factor.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the first factor.
/// \param __B
///    A 256-bit vector of [8 x float] containing the second factor.
/// \param __C
///    A 256-bit vector of [8 x float] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the first factor.
/// \param __B
///    A 256-bit vector of [4 x double] containing the second factor.
/// \param __C
///    A 256-bit vector of [4 x double] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 256-bit vector of [8 x float] containing the second factor.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 256-bit vector of [4 x double] containing the second factor.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 256-bit vector of [8 x float] containing the second factor.
/// \param __C
///    A 256-bit vector of [8 x float] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the first factor; it is
///    negated before the multiplication.
/// \param __B
///    A 256-bit vector of [4 x double] containing the second factor.
/// \param __C
///    A 256-bit vector of [4 x double] containing the subtrahend; it is
///    negated before being passed as the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
Expand Down Expand Up @@ -230,5 +222,7 @@ _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)

/* Remove the attribute helper macros (plain and CONSTEXPR variants) so they
 * are no longer defined after this header. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS256_CONSTEXPR

#endif /* __FMA4INTRIN_H */
42 changes: 26 additions & 16 deletions clang/lib/Headers/fmaintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@
/* Default attribute sets for the intrinsics in this header: always-inline,
 * nodebug, target feature "fma", and a minimum vector width of 128 or 256
 * bits respectively. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))

/* When compiled as C++11 or later, the *_CONSTEXPR attribute sets append
 * `constexpr` to the default attributes; in C and in earlier C++ they expand
 * to the plain default attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#else
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#endif

/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
Expand All @@ -32,7 +40,7 @@
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
Expand All @@ -53,7 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
Expand Down Expand Up @@ -132,7 +140,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
Expand All @@ -153,7 +161,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
Expand Down Expand Up @@ -232,7 +240,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
Expand All @@ -253,7 +261,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
Expand Down Expand Up @@ -332,7 +340,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
Expand All @@ -353,7 +361,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
Expand Down Expand Up @@ -536,7 +544,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
Expand All @@ -557,7 +565,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
Expand All @@ -578,7 +586,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
Expand All @@ -599,7 +607,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
Expand All @@ -620,7 +628,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
Expand All @@ -641,7 +649,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
Expand All @@ -662,7 +670,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
Expand All @@ -683,7 +691,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
Expand Down Expand Up @@ -808,5 +816,7 @@ _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)

/* Remove the attribute helper macros (plain and CONSTEXPR variants) so they
 * are no longer defined after this header. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS256_CONSTEXPR

#endif /* __FMAINTRIN_H */
Loading
Loading