@@ -29,12 +29,10 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
2929; SI-NEXT: s_mov_b32 s0, s4
3030; SI-NEXT: s_mov_b32 s1, s5
3131; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
32- ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
33- ; SI-NEXT: s_cselect_b64 s[4:5], 1, 0
34- ; SI-NEXT: s_add_u32 s4, s10, s4
35- ; SI-NEXT: s_addc_u32 s5, s11, s5
36- ; SI-NEXT: v_mov_b32_e32 v0, s4
37- ; SI-NEXT: v_mov_b32_e32 v1, s5
32+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
33+ ; SI-NEXT: v_mov_b32_e32 v1, s11
34+ ; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
35+ ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3836; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3937; SI-NEXT: s_endpgm
4038;
@@ -47,17 +45,15 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
4745; VI-NEXT: s_add_u32 s2, s6, s0
4846; VI-NEXT: v_mov_b32_e32 v2, s7
4947; VI-NEXT: s_addc_u32 s3, s7, s1
48+ ; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
5049; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
51- ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
50+ ; VI-NEXT: v_mov_b32_e32 v3, s3
51+ ; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
52+ ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
53+ ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
5254; VI-NEXT: v_mov_b32_e32 v0, s4
53- ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
54- ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
55- ; VI-NEXT: s_cselect_b64 s[0:1], 1, 0
56- ; VI-NEXT: s_add_u32 s0, s2, s0
57- ; VI-NEXT: s_addc_u32 s1, s3, s1
58- ; VI-NEXT: v_mov_b32_e32 v3, s1
5955; VI-NEXT: v_mov_b32_e32 v1, s5
60- ; VI-NEXT: v_mov_b32_e32 v2, s0
56+ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
6157; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
6258; VI-NEXT: s_endpgm
6359;
@@ -71,15 +67,13 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
7167; GFX9-NEXT: s_add_u32 s0, s6, s2
7268; GFX9-NEXT: v_mov_b32_e32 v1, s7
7369; GFX9-NEXT: s_addc_u32 s1, s7, s3
70+ ; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
7471; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
75- ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
76- ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
77- ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
78- ; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0
79- ; GFX9-NEXT: s_add_u32 s0, s0, s2
80- ; GFX9-NEXT: s_addc_u32 s1, s1, s3
81- ; GFX9-NEXT: v_mov_b32_e32 v0, s0
8272; GFX9-NEXT: v_mov_b32_e32 v1, s1
73+ ; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
74+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
75+ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
76+ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8377; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
8478; GFX9-NEXT: s_endpgm
8579;
@@ -93,14 +87,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
9387; GFX10-NEXT: s_add_u32 s0, s6, s2
9488; GFX10-NEXT: s_addc_u32 s1, s7, s3
9589; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
96- ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7]
97- ; GFX10-NEXT: s_xor_b32 s2, s2, s6
98- ; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
99- ; GFX10-NEXT: s_cselect_b64 s[2:3], 1, 0
100- ; GFX10-NEXT: s_add_u32 s0, s0, s2
101- ; GFX10-NEXT: s_addc_u32 s1, s1, s3
102- ; GFX10-NEXT: v_mov_b32_e32 v0, s0
103- ; GFX10-NEXT: v_mov_b32_e32 v1, s1
90+ ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
91+ ; GFX10-NEXT: s_xor_b32 s2, s2, s3
92+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
93+ ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
94+ ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
10495; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
10596; GFX10-NEXT: s_endpgm
10697;
@@ -109,20 +100,18 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
109100; GFX11-NEXT: s_clause 0x1
110101; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
111102; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
103+ ; GFX11-NEXT: v_mov_b32_e32 v2, 0
112104; GFX11-NEXT: s_waitcnt lgkmcnt(0)
113105; GFX11-NEXT: s_add_u32 s2, s6, s0
114106; GFX11-NEXT: s_addc_u32 s3, s7, s1
115107; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
116- ; GFX11-NEXT: v_cmp_lt_i64_e64 s6 , s[2:3], s[6:7]
108+ ; GFX11-NEXT: v_cmp_lt_i64_e64 s1 , s[2:3], s[6:7]
117109; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
118- ; GFX11-NEXT: s_xor_b32 s0, s0, s6
119- ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
120- ; GFX11-NEXT: s_cselect_b64 s[0:1], 1, 0
121- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
122- ; GFX11-NEXT: s_add_u32 s0, s2, s0
123- ; GFX11-NEXT: s_addc_u32 s1, s3, s1
124- ; GFX11-NEXT: v_mov_b32_e32 v0, s0
125- ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
110+ ; GFX11-NEXT: s_xor_b32 s0, s0, s1
111+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
112+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
113+ ; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
114+ ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
126115; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
127116; GFX11-NEXT: s_nop 0
128117; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
0 commit comments