diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 2cad8eeea33cf..b67a1c513c49f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s @@ -16,6 +18,18 @@ ; -------------------------------------------------------------------- define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -201,6 +215,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt } define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -388,6 +414,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr } define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -585,6 +623,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr } define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -764,6 +814,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -946,6 +1008,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1136,6 +1210,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g } define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1326,6 +1412,18 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g } define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1511,6 +1609,18 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ } define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1714,6 +1824,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt } define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1917,6 +2039,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor } define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2113,6 +2247,18 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno } define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2300,6 +2446,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p } define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2473,6 +2631,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ } define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2676,6 +2846,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a } define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2872,6 +3054,18 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p } define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3054,6 +3248,18 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( } define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3210,6 +3416,18 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ } define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3406,6 +3624,18 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr } define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3607,6 +3837,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr } define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3800,6 +4042,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add } define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4001,6 +4255,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ } define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4194,6 +4460,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu } define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4379,6 +4657,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ } define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4562,6 +4852,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; -------------------------------------------------------------------- define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4733,6 +5035,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo } define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4906,6 +5220,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi } define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5089,6 +5415,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi } define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5242,6 +5580,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem } define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5398,6 +5748,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f } define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5562,6 +5924,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f } define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5738,6 +6112,18 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f } define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5897,6 +6283,18 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ } define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6070,6 +6468,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f } define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6226,6 +6636,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ } define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6427,6 +6849,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr } define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6620,6 +7054,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt } define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6791,6 +7237,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo } define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6948,6 +7406,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; -------------------------------------------------------------------- define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7170,6 +7640,18 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p } define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7393,6 +7875,18 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g } define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7624,6 +8118,18 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g } define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7828,6 +8334,18 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8035,6 +8553,18 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8254,6 +8784,92 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; -------------------------------------------------------------------- define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8676,6 +9292,92 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr } define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9112,6 +9814,93 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra } define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9549,6 +10338,90 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra } define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9957,6 +10830,90 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10378,6 +11335,92 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10800,6 +11843,69 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g } define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v3.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11123,6 +12229,68 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ } define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v3.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11434,6 +12602,91 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n } define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11874,6 +13127,90 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr } define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12303,6 +13640,101 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; -------------------------------------------------------------------- define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12805,6 +14237,101 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( } define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13324,6 +14851,104 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13844,6 +15469,99 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ } define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14330,6 +16048,99 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( } define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14832,6 +16643,103 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ } define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -15335,6 +17243,81 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -15747,6 +17730,79 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ } define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -16145,6 +18201,102 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ } define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX1250-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -16668,6 +18820,100 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine } define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; +; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX1250-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12-TRUE16: ; %bb.0: ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17178,6 +19424,18 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; -------------------------------------------------------------------- define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17409,6 +19667,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me } define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17642,6 +19912,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ } define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17879,6 +20161,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ } define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -18088,6 +20382,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory } define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -18300,6 +20606,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine } define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -18520,6 +20838,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine } define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -18756,6 +21086,18 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no } define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -18971,6 +21313,18 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin } define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19216,6 +21570,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p } define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19451,6 +21817,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a } define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19682,6 +22060,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me } define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19891,6 +22281,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory } define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20136,6 +22538,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac } define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20375,6 +22789,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; -------------------------------------------------------------------- define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20751,6 +23177,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained } define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -21129,6 +23567,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ } define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -21511,6 +23961,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ } define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -21875,6 +24337,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor } define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -22242,6 +24716,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin } define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -22617,6 +25103,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin } define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -22998,6 +25496,18 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu } define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -23368,6 +25878,18 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi } define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -23744,6 +26266,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor } define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -24108,6 +26642,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr } define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -24484,6 +27030,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained } define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -24848,6 +27406,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor } define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -25224,6 +27794,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs } define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0