Skip to content

Commit 09854ff

Browse files
committed
[AMDGPU] Re-enable atomic optimization of uniform fadd/fsub with result
1 parent f057130 commit 09854ff

File tree

4 files changed

+1458
-1447
lines changed

4 files changed

+1458
-1447
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,6 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
226226

227227
bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
228228

229-
if ((Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FSub) &&
230-
!I.use_empty()) {
231-
// Disable the uniform return value calculation using fmul because it
232-
// mishandles infinities, NaNs and signed zeros. FIXME.
233-
ValDivergent = true;
234-
}
235-
236229
// If the value operand is divergent, each lane is contributing a different
237230
// value to the atomic calculation. We can only optimize divergent values if
238231
// we have DPP available on our subtarget, and the atomic operation is 32
@@ -995,13 +988,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
995988
break;
996989
case AtomicRMWInst::FAdd:
997990
case AtomicRMWInst::FSub: {
998-
// FIXME: This path is currently disabled in visitAtomicRMWInst because
999-
// of problems calculating the first active lane of the result (where
1000-
// Mbcnt is 0):
1001-
// - If V is infinity or NaN we will return NaN instead of BroadcastI.
1002-
// - If BroadcastI is -0.0 and V is positive we will return +0.0 instead
1003-
// of -0.0.
1004991
LaneOffset = B.CreateFMul(V, Mbcnt);
992+
// The first active lane of LaneOffset needs to be the identity (-0 for
993+
// fadd or +0 for fsub). The value we have calculated is V*0 which might
994+
// have the wrong sign or might be nan (if V is inf or nan). Correct it
995+
// with a select.
996+
// TODO: We might not need this if we can prove V is not inf or nan and
997+
// we don't care about signed zeros.
998+
// TODO: Investigate using Intrinsic::amdgcn_fmul_legacy for this.
999+
LaneOffset = B.CreateSelect(Cond, Identity, LaneOffset);
10051000
break;
10061001
}
10071002
}

llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll

Lines changed: 31 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,44 @@
44
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
55
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
66
; GCN: ; %bb.0:
7-
; GCN-NEXT: s_mov_b64 s[2:3], exec
8-
; GCN-NEXT: v_bfrev_b32_e32 v1, 1
9-
; GCN-NEXT: v_mov_b32_e32 v2, 4.0
10-
; GCN-NEXT: ; implicit-def: $vgpr0
11-
; GCN-NEXT: .LBB0_1: ; %ComputeLoop
12-
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
13-
; GCN-NEXT: s_ff1_i32_b64 s6, s[2:3]
14-
; GCN-NEXT: s_lshl_b64 s[4:5], 1, s6
15-
; GCN-NEXT: v_readfirstlane_b32 s7, v1
16-
; GCN-NEXT: v_readlane_b32 s8, v2, s6
17-
; GCN-NEXT: s_mov_b32 m0, s6
18-
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
19-
; GCN-NEXT: v_writelane_b32 v0, s7, m0
20-
; GCN-NEXT: s_cmp_lg_u64 s[2:3], 0
21-
; GCN-NEXT: v_add_f32_e32 v1, s8, v1
22-
; GCN-NEXT: s_cbranch_scc1 .LBB0_1
23-
; GCN-NEXT: ; %bb.2: ; %ComputeEnd
24-
; GCN-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
25-
; GCN-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
26-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
27-
; GCN-NEXT: ; implicit-def: $vgpr2
7+
; GCN-NEXT: s_mov_b64 s[6:7], exec
8+
; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
9+
; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
10+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11+
; GCN-NEXT: ; implicit-def: $vgpr1
2812
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
29-
; GCN-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
30-
; GCN-NEXT: s_cbranch_execz .LBB0_6
31-
; GCN-NEXT: ; %bb.3:
32-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
33-
; GCN-NEXT: s_mov_b64 s[4:5], 0
34-
; GCN-NEXT: v_mov_b32_e32 v3, 0
13+
; GCN-NEXT: s_cbranch_execz .LBB0_4
14+
; GCN-NEXT: ; %bb.1:
15+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
16+
; GCN-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
17+
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s1
18+
; GCN-NEXT: s_mov_b64 s[6:7], 0
19+
; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1
3520
; GCN-NEXT: s_waitcnt lgkmcnt(0)
36-
; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
21+
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
22+
; GCN-NEXT: v_mov_b32_e32 v3, 0
3723
; GCN-NEXT: s_waitcnt lgkmcnt(0)
38-
; GCN-NEXT: v_mov_b32_e32 v2, s6
39-
; GCN-NEXT: .LBB0_4: ; %atomicrmw.start
24+
; GCN-NEXT: v_mov_b32_e32 v1, s0
25+
; GCN-NEXT: .LBB0_2: ; %atomicrmw.start
4026
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
41-
; GCN-NEXT: v_mov_b32_e32 v5, v2
42-
; GCN-NEXT: v_add_f32_e32 v4, v5, v1
43-
; GCN-NEXT: global_atomic_cmpswap v2, v3, v[4:5], s[0:1] glc
27+
; GCN-NEXT: v_mov_b32_e32 v5, v1
28+
; GCN-NEXT: v_add_f32_e32 v4, v5, v2
29+
; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc
4430
; GCN-NEXT: s_waitcnt vmcnt(0)
4531
; GCN-NEXT: buffer_wbinvl1
46-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
47-
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
48-
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
49-
; GCN-NEXT: s_cbranch_execnz .LBB0_4
50-
; GCN-NEXT: ; %bb.5: ; %Flow
51-
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
52-
; GCN-NEXT: .LBB0_6: ; %Flow4
32+
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5
33+
; GCN-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
34+
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
35+
; GCN-NEXT: s_cbranch_execnz .LBB0_2
36+
; GCN-NEXT: ; %bb.3: ; %Flow
37+
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
38+
; GCN-NEXT: .LBB0_4: ; %Flow2
5339
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
54-
; GCN-NEXT: v_readfirstlane_b32 s0, v2
40+
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
41+
; GCN-NEXT: v_readfirstlane_b32 s0, v1
42+
; GCN-NEXT: v_mul_f32_e32 v0, 4.0, v0
43+
; GCN-NEXT: v_bfrev_b32_e32 v1, 1
44+
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
5545
; GCN-NEXT: v_add_f32_e32 v0, s0, v0
5646
; GCN-NEXT: global_store_dword v[0:1], v0, off
5747
; GCN-NEXT: s_endpgm

0 commit comments

Comments (0)