Skip to content

Commit 99058a3

Browse files
committed
[AArch64] Improve code generation for experimental.cttz.elts
This patch extends support for lowering the experimental.cttz.elts intrinsic to BRKB + CNTP instruction sequences, using this lowering for all legal predicate types (nxv16i1, nxv8i1, nxv4i1 and nxv2i1) rather than only nxv16i1. An unused parameter is also removed from some of the related regression tests.
1 parent e6d29be commit 99058a3

File tree

3 files changed

+205
-54
lines changed

3 files changed

+205
-54
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1852,7 +1852,16 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
18521852
}
18531853

18541854
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1855-
return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1855+
// Only SVE and SME architectures support BRKB and CNTP instructions.
1856+
if (!Subtarget->hasSVEorSME())
1857+
return true;
1858+
1859+
// We can only use the BRKB + CNTP sequence with legal predicate types.
1860+
if (VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1861+
VT != MVT::nxv2i1)
1862+
return true;
1863+
1864+
return false;
18561865
}
18571866

18581867
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2075,6 +2075,18 @@ let Predicates = [HasSVEorSME] in {
20752075
def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
20762076
(CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
20772077
(BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
2078+
2079+
def : Pat<(i64 (AArch64CttzElts nxv8i1:$Op1)),
2080+
(CNTP_XPP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op1),
2081+
(BRKB_PPzP (PTRUE_H 31), PPR:$Op1))>;
2082+
2083+
def : Pat<(i64 (AArch64CttzElts nxv4i1:$Op1)),
2084+
(CNTP_XPP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op1),
2085+
(BRKB_PPzP (PTRUE_S 31), PPR:$Op1))>;
2086+
2087+
def : Pat<(i64 (AArch64CttzElts nxv2i1:$Op1)),
2088+
(CNTP_XPP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op1),
2089+
(BRKB_PPzP (PTRUE_D 31), PPR:$Op1))>;
20782090
}
20792091

20802092
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2168,6 +2180,30 @@ let Predicates = [HasSVEorSME] in {
21682180
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
21692181
sub_32)>;
21702182

2183+
def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv8i1:$Op2)))),
2184+
(INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), GPR64:$Op1)>;
2185+
2186+
def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv8i1:$Op2))))),
2187+
(EXTRACT_SUBREG (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2),
2188+
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
2189+
sub_32)>;
2190+
2191+
def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv4i1:$Op2)))),
2192+
(INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), GPR64:$Op1)>;
2193+
2194+
def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv4i1:$Op2))))),
2195+
(EXTRACT_SUBREG (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2),
2196+
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
2197+
sub_32)>;
2198+
2199+
def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv2i1:$Op2)))),
2200+
(INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), GPR64:$Op1)>;
2201+
2202+
def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv2i1:$Op2))))),
2203+
(EXTRACT_SUBREG (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2),
2204+
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
2205+
sub_32)>;
2206+
21712207
defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
21722208
defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
21732209
defm INDEX_RI : sve_int_index_ri<"index">;

llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll

Lines changed: 159 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,6 @@
44

55
; WITH VSCALE RANGE
66

7-
define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
8-
; CHECK-LABEL: ctz_nxv8i1:
9-
; CHECK: // %bb.0:
10-
; CHECK-NEXT: index z0.h, #0, #-1
11-
; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
12-
; CHECK-NEXT: ptrue p0.h
13-
; CHECK-NEXT: cnth x9
14-
; CHECK-NEXT: inch z0.h
15-
; CHECK-NEXT: and z0.d, z0.d, z1.d
16-
; CHECK-NEXT: and z0.h, z0.h, #0xff
17-
; CHECK-NEXT: umaxv h0, p0, z0.h
18-
; CHECK-NEXT: fmov w8, s0
19-
; CHECK-NEXT: sub w8, w9, w8
20-
; CHECK-NEXT: and x0, x8, #0xff
21-
; CHECK-NEXT: ret
22-
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 0)
23-
ret i64 %res
24-
}
25-
267
define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
278
; CHECK-LABEL: ctz_nxv32i1:
289
; CHECK: // %bb.0:
@@ -156,53 +137,178 @@ define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
156137
ret i64 %res
157138
}
158139

159-
; NO VSCALE RANGE
140+
; MATCH WITH BRKB + CNTP
160141

161-
define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
162-
; CHECK-LABEL: ctz_nxv8i1_no_range:
142+
define i32 @ctz_nxv2i1(<vscale x 2 x i1> %a) {
143+
; CHECK-LABEL: ctz_nxv2i1:
163144
; CHECK: // %bb.0:
164-
; CHECK-NEXT: index z0.s, #0, #-1
165-
; CHECK-NEXT: cntw x8
166-
; CHECK-NEXT: punpklo p1.h, p0.b
167-
; CHECK-NEXT: neg x8, x8
168-
; CHECK-NEXT: punpkhi p0.h, p0.b
169-
; CHECK-NEXT: cnth x9
170-
; CHECK-NEXT: mov z1.s, w8
171-
; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
172-
; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
173-
; CHECK-NEXT: ptrue p0.s
174-
; CHECK-NEXT: incw z0.s, all, mul #2
175-
; CHECK-NEXT: add z1.s, z0.s, z1.s
176-
; CHECK-NEXT: and z0.d, z0.d, z2.d
177-
; CHECK-NEXT: and z1.d, z1.d, z3.d
178-
; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
179-
; CHECK-NEXT: umaxv s0, p0, z0.s
180-
; CHECK-NEXT: fmov w8, s0
181-
; CHECK-NEXT: sub w0, w9, w8
145+
; CHECK-NEXT: ptrue p1.d
146+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
147+
; CHECK-NEXT: cntp x0, p0, p0.d
148+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
149+
; CHECK-NEXT: ret
150+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 0)
151+
ret i32 %res
152+
}
153+
154+
define i32 @ctz_nxv2i1_poison(<vscale x 2 x i1> %a) {
155+
; CHECK-LABEL: ctz_nxv2i1_poison:
156+
; CHECK: // %bb.0:
157+
; CHECK-NEXT: ptrue p1.d
158+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
159+
; CHECK-NEXT: cntp x0, p0, p0.d
160+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
161+
; CHECK-NEXT: ret
162+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 1)
163+
ret i32 %res
164+
}
165+
166+
define i64 @add_i64_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i64 %b) {
167+
; CHECK-LABEL: add_i64_ctz_nxv2i1_poison:
168+
; CHECK: // %bb.0:
169+
; CHECK-NEXT: ptrue p1.d
170+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
171+
; CHECK-NEXT: incp x0, p0.d
172+
; CHECK-NEXT: ret
173+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
174+
%add = add i64 %res, %b
175+
ret i64 %add
176+
}
177+
178+
define i32 @add_i32_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i32 %b) {
179+
; CHECK-LABEL: add_i32_ctz_nxv2i1_poison:
180+
; CHECK: // %bb.0:
181+
; CHECK-NEXT: ptrue p1.d
182+
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
183+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
184+
; CHECK-NEXT: incp x0, p0.d
185+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
186+
; CHECK-NEXT: ret
187+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
188+
%trunc = trunc i64 %res to i32
189+
%add = add i32 %trunc, %b
190+
ret i32 %add
191+
}
192+
193+
define i32 @ctz_nxv4i1(<vscale x 4 x i1> %a) {
194+
; CHECK-LABEL: ctz_nxv4i1:
195+
; CHECK: // %bb.0:
196+
; CHECK-NEXT: ptrue p1.s
197+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
198+
; CHECK-NEXT: cntp x0, p0, p0.s
199+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
200+
; CHECK-NEXT: ret
201+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 0)
202+
ret i32 %res
203+
}
204+
205+
define i32 @ctz_nxv4i1_poison(<vscale x 4 x i1> %a) {
206+
; CHECK-LABEL: ctz_nxv4i1_poison:
207+
; CHECK: // %bb.0:
208+
; CHECK-NEXT: ptrue p1.s
209+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
210+
; CHECK-NEXT: cntp x0, p0, p0.s
211+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
212+
; CHECK-NEXT: ret
213+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 1)
214+
ret i32 %res
215+
}
216+
217+
define i64 @add_i64_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i64 %b) {
218+
; CHECK-LABEL: add_i64_ctz_nxv4i1_poison:
219+
; CHECK: // %bb.0:
220+
; CHECK-NEXT: ptrue p1.s
221+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
222+
; CHECK-NEXT: incp x0, p0.s
223+
; CHECK-NEXT: ret
224+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
225+
%add = add i64 %res, %b
226+
ret i64 %add
227+
}
228+
229+
define i32 @add_i32_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i32 %b) {
230+
; CHECK-LABEL: add_i32_ctz_nxv4i1_poison:
231+
; CHECK: // %bb.0:
232+
; CHECK-NEXT: ptrue p1.s
233+
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
234+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
235+
; CHECK-NEXT: incp x0, p0.s
236+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
237+
; CHECK-NEXT: ret
238+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
239+
%trunc = trunc i64 %res to i32
240+
%add = add i32 %trunc, %b
241+
ret i32 %add
242+
}
243+
244+
define i32 @ctz_nxv8i1(<vscale x 8 x i1> %a) {
245+
; CHECK-LABEL: ctz_nxv8i1:
246+
; CHECK: // %bb.0:
247+
; CHECK-NEXT: ptrue p1.h
248+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
249+
; CHECK-NEXT: cntp x0, p0, p0.h
250+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
182251
; CHECK-NEXT: ret
183252
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 0)
184253
ret i32 %res
185254
}
186255

187-
; MATCH WITH BRKB + CNTP
256+
define i32 @ctz_nxv8i1_poison(<vscale x 8 x i1> %a) {
257+
; CHECK-LABEL: ctz_nxv8i1_poison:
258+
; CHECK: // %bb.0:
259+
; CHECK-NEXT: ptrue p1.h
260+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
261+
; CHECK-NEXT: cntp x0, p0, p0.h
262+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
263+
; CHECK-NEXT: ret
264+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 1)
265+
ret i32 %res
266+
}
267+
268+
define i64 @add_i64_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i64 %b) {
269+
; CHECK-LABEL: add_i64_ctz_nxv8i1_poison:
270+
; CHECK: // %bb.0:
271+
; CHECK-NEXT: ptrue p1.h
272+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
273+
; CHECK-NEXT: incp x0, p0.h
274+
; CHECK-NEXT: ret
275+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
276+
%add = add i64 %res, %b
277+
ret i64 %add
278+
}
188279

189-
define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
280+
define i32 @add_i32_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i32 %b) {
281+
; CHECK-LABEL: add_i32_ctz_nxv8i1_poison:
282+
; CHECK: // %bb.0:
283+
; CHECK-NEXT: ptrue p1.h
284+
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
285+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
286+
; CHECK-NEXT: incp x0, p0.h
287+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
288+
; CHECK-NEXT: ret
289+
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
290+
%trunc = trunc i64 %res to i32
291+
%add = add i32 %trunc, %b
292+
ret i32 %add
293+
}
294+
295+
define i32 @ctz_nxv16i1(<vscale x 16 x i1> %a) {
190296
; CHECK-LABEL: ctz_nxv16i1:
191297
; CHECK: // %bb.0:
192-
; CHECK-NEXT: ptrue p0.b
193-
; CHECK-NEXT: brkb p0.b, p0/z, p1.b
298+
; CHECK-NEXT: ptrue p1.b
299+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
194300
; CHECK-NEXT: cntp x0, p0, p0.b
195301
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
196302
; CHECK-NEXT: ret
197303
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 0)
198304
ret i32 %res
199305
}
200306

201-
define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
307+
define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
202308
; CHECK-LABEL: ctz_nxv16i1_poison:
203309
; CHECK: // %bb.0:
204-
; CHECK-NEXT: ptrue p0.b
205-
; CHECK-NEXT: brkb p0.b, p0/z, p1.b
310+
; CHECK-NEXT: ptrue p1.b
311+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
206312
; CHECK-NEXT: cntp x0, p0, p0.b
207313
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
208314
; CHECK-NEXT: ret
@@ -226,24 +332,24 @@ define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vsca
226332
ret i32 %res
227333
}
228334

229-
define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i64 %b) {
335+
define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i64 %b) {
230336
; CHECK-LABEL: add_i64_ctz_nxv16i1_poison:
231337
; CHECK: // %bb.0:
232-
; CHECK-NEXT: ptrue p0.b
233-
; CHECK-NEXT: brkb p0.b, p0/z, p1.b
338+
; CHECK-NEXT: ptrue p1.b
339+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
234340
; CHECK-NEXT: incp x0, p0.b
235341
; CHECK-NEXT: ret
236342
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
237343
%add = add i64 %res, %b
238344
ret i64 %add
239345
}
240346

241-
define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i32 %b) {
347+
define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
242348
; CHECK-LABEL: add_i32_ctz_nxv16i1_poison:
243349
; CHECK: // %bb.0:
244-
; CHECK-NEXT: ptrue p0.b
350+
; CHECK-NEXT: ptrue p1.b
245351
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
246-
; CHECK-NEXT: brkb p0.b, p0/z, p1.b
352+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
247353
; CHECK-NEXT: incp x0, p0.b
248354
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
249355
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)