Skip to content

Commit 70d345e

Browse files
committed
[AArch64][ARM] Always expand ordered vector reductions (PR44600)
fadd/fmul reductions without reassoc are lowered to VECREDUCE_STRICT_FADD/FMUL nodes, which don't have legalization support. Until that is in place, expand these intrinsics on ARM and AArch64; other targets always expand the vector reduction intrinsics.

Additionally, expand fmax/fmin reductions without the nonan flag on AArch64, as the backend asserts that the flag is present when lowering VECREDUCE_FMIN/FMAX.

This fixes https://bugs.llvm.org/show_bug.cgi?id=44600.

Differential Revision: https://reviews.llvm.org/D73135
1 parent 3ae11b4 commit 70d345e

File tree

7 files changed

+687
-2
lines changed

7 files changed

+687
-2
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,21 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
201201
bool &AllowPromotionWithoutCommonHeader);
202202

203203
bool shouldExpandReduction(const IntrinsicInst *II) const {
204-
return false;
204+
switch (II->getIntrinsicID()) {
205+
case Intrinsic::experimental_vector_reduce_v2_fadd:
206+
case Intrinsic::experimental_vector_reduce_v2_fmul:
207+
// We don't have legalization support for ordered FP reductions.
208+
return !II->getFastMathFlags().allowReassoc();
209+
210+
case Intrinsic::experimental_vector_reduce_fmax:
211+
case Intrinsic::experimental_vector_reduce_fmin:
212+
// Lowering asserts that there are no NaNs.
213+
return !II->getFastMathFlags().noNaNs();
214+
215+
default:
216+
// Don't expand anything else, let legalization deal with it.
217+
return false;
218+
}
205219
}
206220

207221
unsigned getGISelRematGlobalCost() const {

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,16 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
170170
TTI::ReductionFlags Flags) const;
171171

172172
bool shouldExpandReduction(const IntrinsicInst *II) const {
173-
return false;
173+
switch (II->getIntrinsicID()) {
174+
case Intrinsic::experimental_vector_reduce_v2_fadd:
175+
case Intrinsic::experimental_vector_reduce_v2_fmul:
176+
// We don't have legalization support for ordered FP reductions.
177+
return !II->getFastMathFlags().allowReassoc();
178+
179+
default:
180+
// Don't expand anything else, let legalization deal with it.
181+
return false;
182+
}
174183
}
175184

176185
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
3+
4+
; Same as vecreduce-fadd-legalization.ll, but without fmf.
5+
6+
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
7+
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
8+
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
9+
declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)
10+
11+
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
12+
declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
13+
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
14+
15+
define half @test_v1f16(<1 x half> %a) nounwind {
16+
; CHECK-LABEL: test_v1f16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: fcvt s0, h0
19+
; CHECK-NEXT: fmov s1, wzr
20+
; CHECK-NEXT: fadd s0, s0, s1
21+
; CHECK-NEXT: fcvt h0, s0
22+
; CHECK-NEXT: ret
23+
%b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
24+
ret half %b
25+
}
26+
27+
define float @test_v1f32(<1 x float> %a) nounwind {
28+
; CHECK-LABEL: test_v1f32:
29+
; CHECK: // %bb.0:
30+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
31+
; CHECK-NEXT: fmov s1, wzr
32+
; CHECK-NEXT: fadd s0, s0, s1
33+
; CHECK-NEXT: ret
34+
%b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
35+
ret float %b
36+
}
37+
38+
define double @test_v1f64(<1 x double> %a) nounwind {
39+
; CHECK-LABEL: test_v1f64:
40+
; CHECK: // %bb.0:
41+
; CHECK-NEXT: fmov d1, xzr
42+
; CHECK-NEXT: fadd d0, d0, d1
43+
; CHECK-NEXT: ret
44+
%b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
45+
ret double %b
46+
}
47+
48+
define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
49+
; CHECK-LABEL: test_v1f128:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
52+
; CHECK-NEXT: adrp x8, .LCPI3_0
53+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
54+
; CHECK-NEXT: bl __addtf3
55+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
56+
; CHECK-NEXT: ret
57+
%b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
58+
ret fp128 %b
59+
}
60+
61+
define float @test_v3f32(<3 x float> %a) nounwind {
62+
; CHECK-LABEL: test_v3f32:
63+
; CHECK: // %bb.0:
64+
; CHECK-NEXT: fmov s1, wzr
65+
; CHECK-NEXT: mov s2, v0.s[1]
66+
; CHECK-NEXT: fadd s1, s0, s1
67+
; CHECK-NEXT: fadd s1, s1, s2
68+
; CHECK-NEXT: mov s0, v0.s[2]
69+
; CHECK-NEXT: fadd s0, s1, s0
70+
; CHECK-NEXT: ret
71+
%b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
72+
ret float %b
73+
}
74+
75+
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
76+
; CHECK-LABEL: test_v2f128:
77+
; CHECK: // %bb.0:
78+
; CHECK-NEXT: sub sp, sp, #32 // =32
79+
; CHECK-NEXT: adrp x8, .LCPI5_0
80+
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
81+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
82+
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
83+
; CHECK-NEXT: bl __addtf3
84+
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
85+
; CHECK-NEXT: bl __addtf3
86+
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
87+
; CHECK-NEXT: add sp, sp, #32 // =32
88+
; CHECK-NEXT: ret
89+
%b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
90+
ret fp128 %b
91+
}
92+
93+
define float @test_v16f32(<16 x float> %a) nounwind {
94+
; CHECK-LABEL: test_v16f32:
95+
; CHECK: // %bb.0:
96+
; CHECK-NEXT: fmov s4, wzr
97+
; CHECK-NEXT: mov s5, v0.s[1]
98+
; CHECK-NEXT: fadd s4, s0, s4
99+
; CHECK-NEXT: fadd s4, s4, s5
100+
; CHECK-NEXT: mov s5, v0.s[2]
101+
; CHECK-NEXT: mov s0, v0.s[3]
102+
; CHECK-NEXT: fadd s4, s4, s5
103+
; CHECK-NEXT: fadd s0, s4, s0
104+
; CHECK-NEXT: mov s5, v1.s[1]
105+
; CHECK-NEXT: fadd s0, s0, s1
106+
; CHECK-NEXT: mov s4, v1.s[2]
107+
; CHECK-NEXT: fadd s0, s0, s5
108+
; CHECK-NEXT: mov s1, v1.s[3]
109+
; CHECK-NEXT: fadd s0, s0, s4
110+
; CHECK-NEXT: fadd s0, s0, s1
111+
; CHECK-NEXT: mov s5, v2.s[1]
112+
; CHECK-NEXT: fadd s0, s0, s2
113+
; CHECK-NEXT: mov s4, v2.s[2]
114+
; CHECK-NEXT: fadd s0, s0, s5
115+
; CHECK-NEXT: mov s1, v2.s[3]
116+
; CHECK-NEXT: fadd s0, s0, s4
117+
; CHECK-NEXT: fadd s0, s0, s1
118+
; CHECK-NEXT: mov s2, v3.s[1]
119+
; CHECK-NEXT: fadd s0, s0, s3
120+
; CHECK-NEXT: mov s5, v3.s[2]
121+
; CHECK-NEXT: fadd s0, s0, s2
122+
; CHECK-NEXT: fadd s0, s0, s5
123+
; CHECK-NEXT: mov s1, v3.s[3]
124+
; CHECK-NEXT: fadd s0, s0, s1
125+
; CHECK-NEXT: ret
126+
%b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
127+
ret float %b
128+
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
3+
4+
declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
5+
declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
6+
declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
7+
declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
8+
9+
declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
10+
declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
11+
declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
12+
13+
define half @test_v1f16(<1 x half> %a) nounwind {
14+
; CHECK-LABEL: test_v1f16:
15+
; CHECK: // %bb.0:
16+
; CHECK-NEXT: ret
17+
%b = call half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a)
18+
ret half %b
19+
}
20+
21+
define float @test_v1f32(<1 x float> %a) nounwind {
22+
; CHECK-LABEL: test_v1f32:
23+
; CHECK: // %bb.0:
24+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
25+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
26+
; CHECK-NEXT: ret
27+
%b = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a)
28+
ret float %b
29+
}
30+
31+
define double @test_v1f64(<1 x double> %a) nounwind {
32+
; CHECK-LABEL: test_v1f64:
33+
; CHECK: // %bb.0:
34+
; CHECK-NEXT: ret
35+
%b = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a)
36+
ret double %b
37+
}
38+
39+
define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
40+
; CHECK-LABEL: test_v1f128:
41+
; CHECK: // %bb.0:
42+
; CHECK-NEXT: ret
43+
%b = call fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a)
44+
ret fp128 %b
45+
}
46+
47+
; TODO: This doesn't work, because ExpandReductions only supports power of two
48+
; unordered reductions.
49+
;define float @test_v3f32(<3 x float> %a) nounwind {
50+
; %b = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a)
51+
; ret float %b
52+
;}
53+
54+
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
55+
; CHECK-LABEL: test_v2f128:
56+
; CHECK: // %bb.0:
57+
; CHECK-NEXT: sub sp, sp, #48 // =48
58+
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
59+
; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
60+
; CHECK-NEXT: bl __gttf2
61+
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
62+
; CHECK-NEXT: cmp w0, #0 // =0
63+
; CHECK-NEXT: b.le .LBB4_2
64+
; CHECK-NEXT: // %bb.1:
65+
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
66+
; CHECK-NEXT: .LBB4_2:
67+
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
68+
; CHECK-NEXT: add sp, sp, #48 // =48
69+
; CHECK-NEXT: ret
70+
%b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
71+
ret fp128 %b
72+
}
73+
74+
define float @test_v16f32(<16 x float> %a) nounwind {
75+
; CHECK-LABEL: test_v16f32:
76+
; CHECK: // %bb.0:
77+
; CHECK-NEXT: fmaxnm v1.4s, v1.4s, v3.4s
78+
; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s
79+
; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
80+
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
81+
; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
82+
; CHECK-NEXT: dup v1.4s, v0.s[1]
83+
; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
84+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
85+
; CHECK-NEXT: ret
86+
%b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a)
87+
ret float %b
88+
}
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
3+
4+
; Same as vecreduce-fmul-legalization.ll, but without fmf.
5+
6+
declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>)
7+
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>)
8+
declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>)
9+
declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>)
10+
11+
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>)
12+
declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>)
13+
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>)
14+
15+
define half @test_v1f16(<1 x half> %a) nounwind {
16+
; CHECK-LABEL: test_v1f16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: fcvt s0, h0
19+
; CHECK-NEXT: fmov s1, wzr
20+
; CHECK-NEXT: fmul s0, s0, s1
21+
; CHECK-NEXT: fcvt h0, s0
22+
; CHECK-NEXT: ret
23+
%b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a)
24+
ret half %b
25+
}
26+
27+
define float @test_v1f32(<1 x float> %a) nounwind {
28+
; CHECK-LABEL: test_v1f32:
29+
; CHECK: // %bb.0:
30+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
31+
; CHECK-NEXT: fmov s1, wzr
32+
; CHECK-NEXT: fmul s0, s1, v0.s[0]
33+
; CHECK-NEXT: ret
34+
%b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a)
35+
ret float %b
36+
}
37+
38+
define double @test_v1f64(<1 x double> %a) nounwind {
39+
; CHECK-LABEL: test_v1f64:
40+
; CHECK: // %bb.0:
41+
; CHECK-NEXT: fmov d1, xzr
42+
; CHECK-NEXT: fmul d0, d0, d1
43+
; CHECK-NEXT: ret
44+
%b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a)
45+
ret double %b
46+
}
47+
48+
define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
49+
; CHECK-LABEL: test_v1f128:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
52+
; CHECK-NEXT: adrp x8, .LCPI3_0
53+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
54+
; CHECK-NEXT: bl __multf3
55+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
56+
; CHECK-NEXT: ret
57+
%b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
58+
ret fp128 %b
59+
}
60+
61+
define float @test_v3f32(<3 x float> %a) nounwind {
62+
; CHECK-LABEL: test_v3f32:
63+
; CHECK: // %bb.0:
64+
; CHECK-NEXT: fmov s1, wzr
65+
; CHECK-NEXT: fmul s1, s1, v0.s[0]
66+
; CHECK-NEXT: fmul s1, s1, v0.s[1]
67+
; CHECK-NEXT: fmul s0, s1, v0.s[2]
68+
; CHECK-NEXT: ret
69+
%b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a)
70+
ret float %b
71+
}
72+
73+
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
74+
; CHECK-LABEL: test_v2f128:
75+
; CHECK: // %bb.0:
76+
; CHECK-NEXT: sub sp, sp, #32 // =32
77+
; CHECK-NEXT: adrp x8, .LCPI5_0
78+
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
79+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
80+
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
81+
; CHECK-NEXT: bl __multf3
82+
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
83+
; CHECK-NEXT: bl __multf3
84+
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
85+
; CHECK-NEXT: add sp, sp, #32 // =32
86+
; CHECK-NEXT: ret
87+
%b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
88+
ret fp128 %b
89+
}
90+
91+
define float @test_v16f32(<16 x float> %a) nounwind {
92+
; CHECK-LABEL: test_v16f32:
93+
; CHECK: // %bb.0:
94+
; CHECK-NEXT: fmov s4, wzr
95+
; CHECK-NEXT: fmul s4, s4, v0.s[0]
96+
; CHECK-NEXT: fmul s4, s4, v0.s[1]
97+
; CHECK-NEXT: fmul s4, s4, v0.s[2]
98+
; CHECK-NEXT: fmul s0, s4, v0.s[3]
99+
; CHECK-NEXT: fmul s0, s0, v1.s[0]
100+
; CHECK-NEXT: fmul s0, s0, v1.s[1]
101+
; CHECK-NEXT: fmul s0, s0, v1.s[2]
102+
; CHECK-NEXT: fmul s0, s0, v1.s[3]
103+
; CHECK-NEXT: fmul s0, s0, v2.s[0]
104+
; CHECK-NEXT: fmul s0, s0, v2.s[1]
105+
; CHECK-NEXT: fmul s0, s0, v2.s[2]
106+
; CHECK-NEXT: fmul s0, s0, v2.s[3]
107+
; CHECK-NEXT: fmul s0, s0, v3.s[0]
108+
; CHECK-NEXT: fmul s0, s0, v3.s[1]
109+
; CHECK-NEXT: fmul s0, s0, v3.s[2]
110+
; CHECK-NEXT: fmul s0, s0, v3.s[3]
111+
; CHECK-NEXT: ret
112+
%b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a)
113+
ret float %b
114+
}

0 commit comments

Comments
 (0)