@@ -574,13 +574,44 @@ entry:
574574define amdgpu_kernel void @add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
575575; GFX6-LABEL: add_i32_varying_vdata:
576576; GFX6: ; %bb.0: ; %entry
577- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
578- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
577+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
578+ ; GFX6-NEXT: s_mov_b32 s4, 0
579+ ; GFX6-NEXT: ; implicit-def: $vgpr1
580+ ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
581+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
582+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
583+ ; GFX6-NEXT: s_mov_b32 m0, s5
584+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
585+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
586+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
587+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
588+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
589+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
590+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
591+ ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
592+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
593+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
594+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
595+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
596+ ; GFX6-NEXT: ; implicit-def: $vgpr0
597+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
598+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
599+ ; GFX6-NEXT: s_cbranch_execz .LBB2_4
600+ ; GFX6-NEXT: ; %bb.3:
601+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
602+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
579603; GFX6-NEXT: s_waitcnt lgkmcnt(0)
580- ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
604+ ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
605+ ; GFX6-NEXT: .LBB2_4:
606+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
607+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
581608; GFX6-NEXT: s_mov_b32 s3, 0xf000
582609; GFX6-NEXT: s_mov_b32 s2, -1
583610; GFX6-NEXT: s_waitcnt vmcnt(0)
611+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
612+ ; GFX6-NEXT: s_waitcnt expcnt(0)
613+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
614+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
584615; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
585616; GFX6-NEXT: s_endpgm
586617;
@@ -937,15 +968,46 @@ entry:
937968define amdgpu_kernel void @struct_add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout , i32 %vindex ) {
938969; GFX6-LABEL: struct_add_i32_varying_vdata:
939970; GFX6: ; %bb.0: ; %entry
940- ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11
941- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
942- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
971+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
972+ ; GFX6-NEXT: s_mov_b32 s4, 0
973+ ; GFX6-NEXT: ; implicit-def: $vgpr1
974+ ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
975+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
976+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
977+ ; GFX6-NEXT: s_mov_b32 m0, s5
978+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
979+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
980+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
981+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
982+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
983+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
984+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
985+ ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
986+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
987+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
988+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
989+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
990+ ; GFX6-NEXT: ; implicit-def: $vgpr0
991+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
992+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
993+ ; GFX6-NEXT: s_cbranch_execz .LBB3_4
994+ ; GFX6-NEXT: ; %bb.3:
995+ ; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11
996+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
997+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
943998; GFX6-NEXT: s_waitcnt lgkmcnt(0)
944- ; GFX6-NEXT: v_mov_b32_e32 v1, s8
945- ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
999+ ; GFX6-NEXT: v_mov_b32_e32 v2, s5
1000+ ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1001+ ; GFX6-NEXT: .LBB3_4:
1002+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
1003+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
9461004; GFX6-NEXT: s_mov_b32 s3, 0xf000
9471005; GFX6-NEXT: s_mov_b32 s2, -1
9481006; GFX6-NEXT: s_waitcnt vmcnt(0)
1007+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
1008+ ; GFX6-NEXT: s_waitcnt expcnt(0)
1009+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
1010+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
9491011; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
9501012; GFX6-NEXT: s_endpgm
9511013;
@@ -2011,13 +2073,44 @@ entry:
20112073define amdgpu_kernel void @sub_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
20122074; GFX6-LABEL: sub_i32_varying_vdata:
20132075; GFX6: ; %bb.0: ; %entry
2014- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
2015- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2076+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
2077+ ; GFX6-NEXT: s_mov_b32 s4, 0
2078+ ; GFX6-NEXT: ; implicit-def: $vgpr1
2079+ ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
2080+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2081+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
2082+ ; GFX6-NEXT: s_mov_b32 m0, s5
2083+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
2084+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
2085+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
2086+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
2087+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
2088+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
2089+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
2090+ ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
2091+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
2092+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2093+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2094+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2095+ ; GFX6-NEXT: ; implicit-def: $vgpr0
2096+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
2097+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
2098+ ; GFX6-NEXT: s_cbranch_execz .LBB7_4
2099+ ; GFX6-NEXT: ; %bb.3:
2100+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
2101+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
20162102; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2017- ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
2103+ ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
2104+ ; GFX6-NEXT: .LBB7_4:
2105+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
2106+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
20182107; GFX6-NEXT: s_mov_b32 s3, 0xf000
20192108; GFX6-NEXT: s_mov_b32 s2, -1
20202109; GFX6-NEXT: s_waitcnt vmcnt(0)
2110+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
2111+ ; GFX6-NEXT: s_waitcnt expcnt(0)
2112+ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
2113+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
20212114; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
20222115; GFX6-NEXT: s_endpgm
20232116;
0 commit comments