Skip to content

Commit 3f8e714

Browse files
committed
[ARM,MVE] Add intrinsics and isel for MVE fused multiply-add.
Summary: This adds the ACLE intrinsic family for the VFMA and VFMS instructions, which perform fused multiply-add on vectors of floats. I've represented the unpredicated versions in IR using the cross-platform `@llvm.fma` IR intrinsic. We already had isel rules to convert one of those into a vector VFMA in the simplest possible way; but we didn't have rules to detect a negated argument and turn it into VFMS, or rules to detect a splat argument and turn it into one of the two vector/scalar forms of the instruction. Now we have all of those. The predicated form uses a target-specific intrinsic as usual, but I've stuck to just one, for a predicated FMA. The subtraction and splat versions are code-generated by passing an fneg or a splat as one of its operands, the same way as the unpredicated version. In arm_mve_defs.td, I've had to introduce a tiny extra piece of infrastructure: a record `id` for use in codegen dags which implements the identity function. (Just because you can't declare a TableGen value of type dag which is *only* a `$varname`: you have to wrap it in something. Now I can write `(id $varname)` to get the same effect.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: dmgreen Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D75998
1 parent d608fee commit 3f8e714

File tree

7 files changed

+623
-32
lines changed

7 files changed

+623
-32
lines changed

clang/include/clang/Basic/arm_mve.td

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,46 @@ let pnt = PNT_NType in {
162162
}
163163
}
164164

165+
multiclass FMA<bit add> {
166+
// FMS instructions are defined in the ArmARM as if they negate the
167+
// second multiply input.
168+
defvar m2_cg = !if(add, (id $m2), (fneg $m2));
169+
170+
defvar unpred_cg = (IRIntBase<"fma", [Vector]> $m1, m2_cg, $addend);
171+
defvar pred_cg = (IRInt<"fma_predicated", [Vector, Predicate]>
172+
$m1, m2_cg, $addend, $pred);
173+
174+
def q: Intrinsic<Vector, (args Vector:$addend, Vector:$m1, Vector:$m2),
175+
unpred_cg>;
176+
177+
def q_m: Intrinsic<Vector, (args Vector:$addend, Vector:$m1, Vector:$m2,
178+
Predicate:$pred), pred_cg>;
179+
180+
// Only FMA has the vector/scalar variants, not FMS
181+
if add then let pnt = PNT_NType in {
182+
183+
def q_n: Intrinsic<Vector, (args Vector:$addend, Vector:$m1,
184+
unpromoted<Scalar>:$m2_s),
185+
(seq (splat $m2_s):$m2, unpred_cg)>;
186+
def sq_n: Intrinsic<Vector, (args Vector:$m1, Vector:$m2,
187+
unpromoted<Scalar>:$addend_s),
188+
(seq (splat $addend_s):$addend, unpred_cg)>;
189+
def q_m_n: Intrinsic<Vector, (args Vector:$addend, Vector:$m1,
190+
unpromoted<Scalar>:$m2_s,
191+
Predicate:$pred),
192+
(seq (splat $m2_s):$m2, pred_cg)>;
193+
def sq_m_n: Intrinsic<Vector, (args Vector:$m1, Vector:$m2,
194+
unpromoted<Scalar>:$addend_s,
195+
Predicate:$pred),
196+
(seq (splat $addend_s):$addend, pred_cg)>;
197+
}
198+
}
199+
200+
let params = T.Float in {
201+
defm vfma: FMA<1>;
202+
defm vfms: FMA<0>;
203+
}
204+
165205
let params = !listconcat(T.Int16, T.Int32) in {
166206
let pnt = PNT_None in {
167207
def vmvnq_n: Intrinsic<Vector, (args imm_simd_vmvn:$imm),

clang/include/clang/Basic/arm_mve_defs.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,18 @@ def unzip: CGHelperFn<"VectorUnzip"> {
133133
}
134134
def zip: CGHelperFn<"VectorZip">;
135135

136+
// Trivial 'codegen' function that just returns its argument. Useful
137+
// for wrapping up a variable name like $foo into a thing you can pass
138+
// around as type 'dag'.
139+
def id: IRBuilderBase {
140+
// All the other cases of IRBuilderBase use 'prefix' to specify a function
141+
// call, including the open parenthesis. MveEmitter puts the closing paren on
142+
// the end. So if we _just_ specify an open paren with no function name
143+
// before it, then the generated C++ code will simply wrap the input value in
144+
// parentheses, returning it unchanged.
145+
let prefix = "(";
146+
}
147+
136148
// Helper for making boolean flags in IR
137149
def i1: IRBuilderBase {
138150
let prefix = "llvm::ConstantInt::get(Builder.getInt1Ty(), ";
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
3+
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -sroa | FileCheck %s
4+
5+
#include <arm_mve.h>
6+
7+
// CHECK-LABEL: @test_vfmaq_f16(
8+
// CHECK-NEXT: entry:
9+
// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[A:%.*]])
10+
// CHECK-NEXT: ret <8 x half> [[TMP0]]
11+
//
12+
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
13+
#ifdef POLYMORPHIC
14+
return vfmaq(a, b, c);
15+
#else /* POLYMORPHIC */
16+
return vfmaq_f16(a, b, c);
17+
#endif /* POLYMORPHIC */
18+
}
19+
20+
// CHECK-LABEL: @test_vfmaq_f32(
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[A:%.*]])
23+
// CHECK-NEXT: ret <4 x float> [[TMP0]]
24+
//
25+
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
26+
#ifdef POLYMORPHIC
27+
return vfmaq(a, b, c);
28+
#else /* POLYMORPHIC */
29+
return vfmaq_f32(a, b, c);
30+
#endif /* POLYMORPHIC */
31+
}
32+
33+
// CHECK-LABEL: @test_vfmaq_n_f16(
34+
// CHECK-NEXT: entry:
35+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
36+
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
37+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
38+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
39+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
40+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
41+
// CHECK-NEXT: ret <8 x half> [[TMP2]]
42+
//
43+
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
44+
#ifdef POLYMORPHIC
45+
return vfmaq(a, b, c);
46+
#else /* POLYMORPHIC */
47+
return vfmaq_n_f16(a, b, c);
48+
#endif /* POLYMORPHIC */
49+
}
50+
51+
// CHECK-LABEL: @test_vfmaq_n_f32(
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
54+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
55+
// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x float> [[A:%.*]])
56+
// CHECK-NEXT: ret <4 x float> [[TMP0]]
57+
//
58+
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
59+
#ifdef POLYMORPHIC
60+
return vfmaq(a, b, c);
61+
#else /* POLYMORPHIC */
62+
return vfmaq_n_f32(a, b, c);
63+
#endif /* POLYMORPHIC */
64+
}
65+
66+
// CHECK-LABEL: @test_vfmasq_n_f16(
67+
// CHECK-NEXT: entry:
68+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
69+
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
70+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
71+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
72+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
73+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
74+
// CHECK-NEXT: ret <8 x half> [[TMP2]]
75+
//
76+
float16x8_t test_vfmasq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
77+
#ifdef POLYMORPHIC
78+
return vfmasq(a, b, c);
79+
#else /* POLYMORPHIC */
80+
return vfmasq_n_f16(a, b, c);
81+
#endif /* POLYMORPHIC */
82+
}
83+
84+
// CHECK-LABEL: @test_vfmasq_n_f32(
85+
// CHECK-NEXT: entry:
86+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
87+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
88+
// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]])
89+
// CHECK-NEXT: ret <4 x float> [[TMP0]]
90+
//
91+
float32x4_t test_vfmasq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
92+
#ifdef POLYMORPHIC
93+
return vfmasq(a, b, c);
94+
#else /* POLYMORPHIC */
95+
return vfmasq_n_f32(a, b, c);
96+
#endif /* POLYMORPHIC */
97+
}
98+
99+
// CHECK-LABEL: @test_vfmsq_f16(
100+
// CHECK-NEXT: entry:
101+
// CHECK-NEXT: [[TMP0:%.*]] = fneg <8 x half> [[C:%.*]]
102+
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[A:%.*]])
103+
// CHECK-NEXT: ret <8 x half> [[TMP1]]
104+
//
105+
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
106+
#ifdef POLYMORPHIC
107+
return vfmsq(a, b, c);
108+
#else /* POLYMORPHIC */
109+
return vfmsq_f16(a, b, c);
110+
#endif /* POLYMORPHIC */
111+
}
112+
113+
// CHECK-LABEL: @test_vfmsq_f32(
114+
// CHECK-NEXT: entry:
115+
// CHECK-NEXT: [[TMP0:%.*]] = fneg <4 x float> [[C:%.*]]
116+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[A:%.*]])
117+
// CHECK-NEXT: ret <4 x float> [[TMP1]]
118+
//
119+
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
120+
#ifdef POLYMORPHIC
121+
return vfmsq(a, b, c);
122+
#else /* POLYMORPHIC */
123+
return vfmsq_f32(a, b, c);
124+
#endif /* POLYMORPHIC */
125+
}
126+
127+
// CHECK-LABEL: @test_vfmaq_m_f16(
128+
// CHECK-NEXT: entry:
129+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
130+
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
131+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[A:%.*]], <8 x i1> [[TMP1]])
132+
// CHECK-NEXT: ret <8 x half> [[TMP2]]
133+
//
134+
float16x8_t test_vfmaq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {
135+
#ifdef POLYMORPHIC
136+
return vfmaq_m(a, b, c, p);
137+
#else /* POLYMORPHIC */
138+
return vfmaq_m_f16(a, b, c, p);
139+
#endif /* POLYMORPHIC */
140+
}
141+
142+
// CHECK-LABEL: @test_vfmaq_m_f32(
143+
// CHECK-NEXT: entry:
144+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
145+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
146+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[A:%.*]], <4 x i1> [[TMP1]])
147+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
148+
//
149+
float32x4_t test_vfmaq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {
150+
#ifdef POLYMORPHIC
151+
return vfmaq_m(a, b, c, p);
152+
#else /* POLYMORPHIC */
153+
return vfmaq_m_f32(a, b, c, p);
154+
#endif /* POLYMORPHIC */
155+
}
156+
157+
// CHECK-LABEL: @test_vfmaq_m_n_f16(
158+
// CHECK-NEXT: entry:
159+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
160+
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
161+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
162+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
163+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
164+
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
165+
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
166+
// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP3]])
167+
// CHECK-NEXT: ret <8 x half> [[TMP4]]
168+
//
169+
float16x8_t test_vfmaq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
170+
#ifdef POLYMORPHIC
171+
return vfmaq_m(a, b, c, p);
172+
#else /* POLYMORPHIC */
173+
return vfmaq_m_n_f16(a, b, c, p);
174+
#endif /* POLYMORPHIC */
175+
}
176+
177+
// CHECK-LABEL: @test_vfmaq_m_n_f32(
178+
// CHECK-NEXT: entry:
179+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
180+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
181+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
182+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
183+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x float> [[A:%.*]], <4 x i1> [[TMP1]])
184+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
185+
//
186+
float32x4_t test_vfmaq_m_n_f32(float32x4_t a, float32x4_t b, float32_t c, mve_pred16_t p) {
187+
#ifdef POLYMORPHIC
188+
return vfmaq_m(a, b, c, p);
189+
#else /* POLYMORPHIC */
190+
return vfmaq_m_n_f32(a, b, c, p);
191+
#endif /* POLYMORPHIC */
192+
}
193+
194+
// CHECK-LABEL: @test_vfmasq_m_n_f16(
195+
// CHECK-NEXT: entry:
196+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
197+
// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
198+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
199+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
200+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
201+
// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
202+
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
203+
// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]])
204+
// CHECK-NEXT: ret <8 x half> [[TMP4]]
205+
//
206+
float16x8_t test_vfmasq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
207+
#ifdef POLYMORPHIC
208+
return vfmasq_m(a, b, c, p);
209+
#else /* POLYMORPHIC */
210+
return vfmasq_m_n_f16(a, b, c, p);
211+
#endif /* POLYMORPHIC */
212+
}
213+
214+
// CHECK-LABEL: @test_vfmasq_m_n_f32(
215+
// CHECK-NEXT: entry:
216+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[C:%.*]], i32 0
217+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
218+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
219+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
220+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[DOTSPLAT]], <4 x i1> [[TMP1]])
221+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
222+
//
223+
float32x4_t test_vfmasq_m_n_f32(float32x4_t a, float32x4_t b, float32_t c, mve_pred16_t p) {
224+
#ifdef POLYMORPHIC
225+
return vfmasq_m(a, b, c, p);
226+
#else /* POLYMORPHIC */
227+
return vfmasq_m_n_f32(a, b, c, p);
228+
#endif /* POLYMORPHIC */
229+
}
230+
231+
// CHECK-LABEL: @test_vfmsq_m_f16(
232+
// CHECK-NEXT: entry:
233+
// CHECK-NEXT: [[TMP0:%.*]] = fneg <8 x half> [[C:%.*]]
234+
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
235+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
236+
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[A:%.*]], <8 x i1> [[TMP2]])
237+
// CHECK-NEXT: ret <8 x half> [[TMP3]]
238+
//
239+
float16x8_t test_vfmsq_m_f16(float16x8_t a, float16x8_t b, float16x8_t c, mve_pred16_t p) {
240+
#ifdef POLYMORPHIC
241+
return vfmsq_m(a, b, c, p);
242+
#else /* POLYMORPHIC */
243+
return vfmsq_m_f16(a, b, c, p);
244+
#endif /* POLYMORPHIC */
245+
}
246+
247+
// CHECK-LABEL: @test_vfmsq_m_f32(
248+
// CHECK-NEXT: entry:
249+
// CHECK-NEXT: [[TMP0:%.*]] = fneg <4 x float> [[C:%.*]]
250+
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
251+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
252+
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[A:%.*]], <4 x i1> [[TMP2]])
253+
// CHECK-NEXT: ret <4 x float> [[TMP3]]
254+
//
255+
float32x4_t test_vfmsq_m_f32(float32x4_t a, float32x4_t b, float32x4_t c, mve_pred16_t p) {
256+
#ifdef POLYMORPHIC
257+
return vfmsq_m(a, b, c, p);
258+
#else /* POLYMORPHIC */
259+
return vfmsq_m_f32(a, b, c, p);
260+
#endif /* POLYMORPHIC */
261+
}

llvm/include/llvm/IR/IntrinsicsARM.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,10 @@ def int_arm_mve_vqmovn_predicated: Intrinsic<[llvm_anyvector_ty],
12431243
llvm_i32_ty /* unsigned output */, llvm_i32_ty /* unsigned input */,
12441244
llvm_i32_ty /* top half */, llvm_anyvector_ty /* pred */], [IntrNoMem]>;
12451245

1246+
def int_arm_mve_fma_predicated: Intrinsic<[llvm_anyvector_ty],
1247+
[LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */,
1248+
LLVMMatchType<0> /* addend */, llvm_anyvector_ty /* pred */], [IntrNoMem]>;
1249+
12461250
// CDE (Custom Datapath Extension)
12471251

12481252
def int_arm_cde_cx1: Intrinsic<

0 commit comments

Comments
 (0)