@@ -571,13 +571,44 @@ entry:
571571define amdgpu_kernel void @add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
572572; GFX6-LABEL: add_i32_varying_vdata:
573573; GFX6: ; %bb.0: ; %entry
574- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
575- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
574+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
575+ ; GFX6-NEXT: s_mov_b32 s4, 0
576+ ; GFX6-NEXT: ; implicit-def: $vgpr1
577+ ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
578+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
579+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
580+ ; GFX6-NEXT: s_mov_b32 m0, s5
581+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
582+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
583+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
584+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
585+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
586+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
587+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
588+ ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
589+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
590+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
591+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
592+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
593+ ; GFX6-NEXT: ; implicit-def: $vgpr0
594+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
595+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
596+ ; GFX6-NEXT: s_cbranch_execz .LBB2_4
597+ ; GFX6-NEXT: ; %bb.3:
598+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
599+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
576600; GFX6-NEXT: s_waitcnt lgkmcnt(0)
577- ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
601+ ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
602+ ; GFX6-NEXT: .LBB2_4:
603+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
604+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
578605; GFX6-NEXT: s_mov_b32 s3, 0xf000
579606; GFX6-NEXT: s_mov_b32 s2, -1
580607; GFX6-NEXT: s_waitcnt vmcnt(0)
608+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
609+ ; GFX6-NEXT: s_waitcnt expcnt(0)
610+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
611+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
581612; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
582613; GFX6-NEXT: s_endpgm
583614;
@@ -924,15 +955,46 @@ entry:
924955define amdgpu_kernel void @struct_add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout , i32 %vindex ) {
925956; GFX6-LABEL: struct_add_i32_varying_vdata:
926957; GFX6: ; %bb.0: ; %entry
927- ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11
928- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
929- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
958+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
959+ ; GFX6-NEXT: s_mov_b32 s4, 0
960+ ; GFX6-NEXT: ; implicit-def: $vgpr1
961+ ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
962+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
963+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
964+ ; GFX6-NEXT: s_mov_b32 m0, s5
965+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
966+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
967+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
968+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
969+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
970+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
971+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
972+ ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
973+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
974+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
975+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
976+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
977+ ; GFX6-NEXT: ; implicit-def: $vgpr0
978+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
979+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
980+ ; GFX6-NEXT: s_cbranch_execz .LBB3_4
981+ ; GFX6-NEXT: ; %bb.3:
982+ ; GFX6-NEXT: s_load_dword s5, s[0:1], 0x11
983+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
984+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
930985; GFX6-NEXT: s_waitcnt lgkmcnt(0)
931- ; GFX6-NEXT: v_mov_b32_e32 v1, s2
932- ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
986+ ; GFX6-NEXT: v_mov_b32_e32 v2, s5
987+ ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
988+ ; GFX6-NEXT: .LBB3_4:
989+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
990+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
933991; GFX6-NEXT: s_mov_b32 s3, 0xf000
934992; GFX6-NEXT: s_mov_b32 s2, -1
935993; GFX6-NEXT: s_waitcnt vmcnt(0)
994+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
995+ ; GFX6-NEXT: s_waitcnt expcnt(0)
996+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
997+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
936998; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
937999; GFX6-NEXT: s_endpgm
9381000;
@@ -1953,13 +2015,44 @@ entry:
19532015define amdgpu_kernel void @sub_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
19542016; GFX6-LABEL: sub_i32_varying_vdata:
19552017; GFX6: ; %bb.0: ; %entry
1956- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1957- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2018+ ; GFX6-NEXT: s_mov_b64 s[2:3], exec
2019+ ; GFX6-NEXT: s_mov_b32 s4, 0
2020+ ; GFX6-NEXT: ; implicit-def: $vgpr1
2021+ ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
2022+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2023+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
2024+ ; GFX6-NEXT: s_mov_b32 m0, s5
2025+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
2026+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
2027+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
2028+ ; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
2029+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
2030+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
2031+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
2032+ ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
2033+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
2034+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2035+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2036+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2037+ ; GFX6-NEXT: ; implicit-def: $vgpr0
2038+ ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
2039+ ; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
2040+ ; GFX6-NEXT: s_cbranch_execz .LBB7_4
2041+ ; GFX6-NEXT: ; %bb.3:
2042+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
2043+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
19582044; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1959- ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
2045+ ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
2046+ ; GFX6-NEXT: .LBB7_4:
2047+ ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
2048+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
19602049; GFX6-NEXT: s_mov_b32 s3, 0xf000
19612050; GFX6-NEXT: s_mov_b32 s2, -1
19622051; GFX6-NEXT: s_waitcnt vmcnt(0)
2052+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
2053+ ; GFX6-NEXT: s_waitcnt expcnt(0)
2054+ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
2055+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
19632056; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
19642057; GFX6-NEXT: s_endpgm
19652058;
0 commit comments