Skip to content

Commit 09854ff

Browse files
committed
[AMDGPU] Re-enable atomic optimization of uniform fadd/fsub with result
1 parent f057130 commit 09854ff

File tree

4 files changed

+1458
-1447
lines changed

4 files changed

+1458
-1447
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,6 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
226226

227227
bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
228228

229-
if ((Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FSub) &&
230-
!I.use_empty()) {
231-
// Disable the uniform return value calculation using fmul because it
232-
// mishandles infinities, NaNs and signed zeros. FIXME.
233-
ValDivergent = true;
234-
}
235-
236229
// If the value operand is divergent, each lane is contributing a different
237230
// value to the atomic calculation. We can only optimize divergent values if
238231
// we have DPP available on our subtarget, and the atomic operation is 32
@@ -995,13 +988,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
995988
break;
996989
case AtomicRMWInst::FAdd:
997990
case AtomicRMWInst::FSub: {
998-
// FIXME: This path is currently disabled in visitAtomicRMWInst because
999-
// of problems calculating the first active lane of the result (where
1000-
// Mbcnt is 0):
1001-
// - If V is infinity or NaN we will return NaN instead of BroadcastI.
1002-
// - If BroadcastI is -0.0 and V is positive we will return +0.0 instead
1003-
// of -0.0.
1004991
LaneOffset = B.CreateFMul(V, Mbcnt);
992+
// The first active lane of LaneOffset needs to be the identity (-0 for
993+
// fadd or +0 for fsub). The value we have calculated is V*0 which might
994+
// have the wrong sign or might be nan (if V is inf or nan). Correct it
995+
// with a select.
996+
// TODO: We might not need this if we can prove V is not inf or nan and
997+
// we don't care about signed zeros.
998+
// TODO: Investigate using Intrinsic::amdgcn_fmul_legacy for this.
999+
LaneOffset = B.CreateSelect(Cond, Identity, LaneOffset);
10051000
break;
10061001
}
10071002
}

llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll

Lines changed: 31 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,44 @@
44
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
55
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
66
; GCN: ; %bb.0:
7-
; GCN-NEXT: s_mov_b64 s[2:3], exec
8-
; GCN-NEXT: v_bfrev_b32_e32 v1, 1
9-
; GCN-NEXT: v_mov_b32_e32 v2, 4.0
10-
; GCN-NEXT: ; implicit-def: $vgpr0
11-
; GCN-NEXT: .LBB0_1: ; %ComputeLoop
12-
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
13-
; GCN-NEXT: s_ff1_i32_b64 s6, s[2:3]
14-
; GCN-NEXT: s_lshl_b64 s[4:5], 1, s6
15-
; GCN-NEXT: v_readfirstlane_b32 s7, v1
16-
; GCN-NEXT: v_readlane_b32 s8, v2, s6
17-
; GCN-NEXT: s_mov_b32 m0, s6
18-
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
19-
; GCN-NEXT: v_writelane_b32 v0, s7, m0
20-
; GCN-NEXT: s_cmp_lg_u64 s[2:3], 0
21-
; GCN-NEXT: v_add_f32_e32 v1, s8, v1
22-
; GCN-NEXT: s_cbranch_scc1 .LBB0_1
23-
; GCN-NEXT: ; %bb.2: ; %ComputeEnd
24-
; GCN-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
25-
; GCN-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
26-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
27-
; GCN-NEXT: ; implicit-def: $vgpr2
7+
; GCN-NEXT: s_mov_b64 s[6:7], exec
8+
; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
9+
; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
10+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11+
; GCN-NEXT: ; implicit-def: $vgpr1
2812
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
29-
; GCN-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
30-
; GCN-NEXT: s_cbranch_execz .LBB0_6
31-
; GCN-NEXT: ; %bb.3:
32-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
33-
; GCN-NEXT: s_mov_b64 s[4:5], 0
34-
; GCN-NEXT: v_mov_b32_e32 v3, 0
13+
; GCN-NEXT: s_cbranch_execz .LBB0_4
14+
; GCN-NEXT: ; %bb.1:
15+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
16+
; GCN-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
17+
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s1
18+
; GCN-NEXT: s_mov_b64 s[6:7], 0
19+
; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1
3520
; GCN-NEXT: s_waitcnt lgkmcnt(0)
36-
; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
21+
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
22+
; GCN-NEXT: v_mov_b32_e32 v3, 0
3723
; GCN-NEXT: s_waitcnt lgkmcnt(0)
38-
; GCN-NEXT: v_mov_b32_e32 v2, s6
39-
; GCN-NEXT: .LBB0_4: ; %atomicrmw.start
24+
; GCN-NEXT: v_mov_b32_e32 v1, s0
25+
; GCN-NEXT: .LBB0_2: ; %atomicrmw.start
4026
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
41-
; GCN-NEXT: v_mov_b32_e32 v5, v2
42-
; GCN-NEXT: v_add_f32_e32 v4, v5, v1
43-
; GCN-NEXT: global_atomic_cmpswap v2, v3, v[4:5], s[0:1] glc
27+
; GCN-NEXT: v_mov_b32_e32 v5, v1
28+
; GCN-NEXT: v_add_f32_e32 v4, v5, v2
29+
; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc
4430
; GCN-NEXT: s_waitcnt vmcnt(0)
4531
; GCN-NEXT: buffer_wbinvl1
46-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
47-
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
48-
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
49-
; GCN-NEXT: s_cbranch_execnz .LBB0_4
50-
; GCN-NEXT: ; %bb.5: ; %Flow
51-
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
52-
; GCN-NEXT: .LBB0_6: ; %Flow4
32+
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5
33+
; GCN-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
34+
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
35+
; GCN-NEXT: s_cbranch_execnz .LBB0_2
36+
; GCN-NEXT: ; %bb.3: ; %Flow
37+
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
38+
; GCN-NEXT: .LBB0_4: ; %Flow2
5339
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
54-
; GCN-NEXT: v_readfirstlane_b32 s0, v2
40+
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
41+
; GCN-NEXT: v_readfirstlane_b32 s0, v1
42+
; GCN-NEXT: v_mul_f32_e32 v0, 4.0, v0
43+
; GCN-NEXT: v_bfrev_b32_e32 v1, 1
44+
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
5545
; GCN-NEXT: v_add_f32_e32 v0, s0, v0
5646
; GCN-NEXT: global_store_dword v[0:1], v0, off
5747
; GCN-NEXT: s_endpgm

0 commit comments

Comments (0)