@@ -29,10 +29,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
2929; SI-NEXT: s_mov_b32 s0, s4
3030; SI-NEXT: s_mov_b32 s1, s5
3131; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
32- ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
33- ; SI-NEXT: v_mov_b32_e32 v1, s11
34- ; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
35- ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
32+ ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
33+ ; SI-NEXT: s_cselect_b64 s[4:5], 1, 0
34+ ; SI-NEXT: s_add_u32 s4, s10, s4
35+ ; SI-NEXT: s_addc_u32 s5, s11, s5
36+ ; SI-NEXT: v_mov_b32_e32 v0, s4
37+ ; SI-NEXT: v_mov_b32_e32 v1, s5
3638; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3739; SI-NEXT: s_endpgm
3840;
@@ -45,15 +47,17 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
4547; VI-NEXT: s_add_u32 s2, s6, s0
4648; VI-NEXT: v_mov_b32_e32 v2, s7
4749; VI-NEXT: s_addc_u32 s3, s7, s1
48- ; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
4950; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
50- ; VI-NEXT: v_mov_b32_e32 v3, s3
51- ; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
52- ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
53- ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
51+ ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
5452; VI-NEXT: v_mov_b32_e32 v0, s4
53+ ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
54+ ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
55+ ; VI-NEXT: s_cselect_b64 s[0:1], 1, 0
56+ ; VI-NEXT: s_add_u32 s0, s2, s0
57+ ; VI-NEXT: s_addc_u32 s1, s3, s1
58+ ; VI-NEXT: v_mov_b32_e32 v3, s1
5559; VI-NEXT: v_mov_b32_e32 v1, s5
56- ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
60+ ; VI-NEXT: v_mov_b32_e32 v2, s0
5761; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5862; VI-NEXT: s_endpgm
5963;
@@ -67,13 +71,15 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
6771; GFX9-NEXT: s_add_u32 s0, s6, s2
6872; GFX9-NEXT: v_mov_b32_e32 v1, s7
6973; GFX9-NEXT: s_addc_u32 s1, s7, s3
70- ; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
7174; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
75+ ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
76+ ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
77+ ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
78+ ; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0
79+ ; GFX9-NEXT: s_add_u32 s0, s0, s2
80+ ; GFX9-NEXT: s_addc_u32 s1, s1, s3
81+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
7282; GFX9-NEXT: v_mov_b32_e32 v1, s1
73- ; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
74- ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
75- ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
76- ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7783; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
7884; GFX9-NEXT: s_endpgm
7985;
@@ -87,11 +93,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
8793; GFX10-NEXT: s_add_u32 s0, s6, s2
8894; GFX10-NEXT: s_addc_u32 s1, s7, s3
8995; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
90- ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
91- ; GFX10-NEXT: s_xor_b32 s2, s2, s3
92- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
93- ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
94- ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
96+ ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7]
97+ ; GFX10-NEXT: s_xor_b32 s2, s2, s6
98+ ; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
99+ ; GFX10-NEXT: s_cselect_b64 s[2:3], 1, 0
100+ ; GFX10-NEXT: s_add_u32 s0, s0, s2
101+ ; GFX10-NEXT: s_addc_u32 s1, s1, s3
102+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
103+ ; GFX10-NEXT: v_mov_b32_e32 v1, s1
95104; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
96105; GFX10-NEXT: s_endpgm
97106;
@@ -100,18 +109,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
100109; GFX11-NEXT: s_clause 0x1
101110; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
102111; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
103- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
104112; GFX11-NEXT: s_waitcnt lgkmcnt(0)
105113; GFX11-NEXT: s_add_u32 s2, s6, s0
106114; GFX11-NEXT: s_addc_u32 s3, s7, s1
107115; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
108- ; GFX11-NEXT: v_cmp_lt_i64_e64 s1 , s[2:3], s[6:7]
116+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s6 , s[2:3], s[6:7]
109117; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
110- ; GFX11-NEXT: s_xor_b32 s0, s0, s1
111- ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
112- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
113- ; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
114- ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
118+ ; GFX11-NEXT: s_xor_b32 s0, s0, s6
119+ ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
120+ ; GFX11-NEXT: s_cselect_b64 s[0:1], 1, 0
121+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
122+ ; GFX11-NEXT: s_add_u32 s0, s2, s0
123+ ; GFX11-NEXT: s_addc_u32 s1, s3, s1
124+ ; GFX11-NEXT: v_mov_b32_e32 v0, s0
125+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
115126; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
116127; GFX11-NEXT: s_nop 0
117128; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
0 commit comments