22; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
33; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
44; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
5- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX11 %s
5+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-TRUE16 %s
6+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-FAKE16 %s
67
78; FIXME: GFX9 should be producing v_mad_u16 instead of v_mad_legacy_u16.
89
@@ -65,22 +66,44 @@ define amdgpu_kernel void @mad_u16(
6566; GFX10-NEXT: global_store_short v0, v1, s[8:9]
6667; GFX10-NEXT: s_endpgm
6768;
68- ; GFX11-LABEL: mad_u16:
69- ; GFX11: ; %bb.0: ; %entry
70- ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
71- ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
72- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
73- ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
74- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
75- ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
76- ; GFX11-NEXT: s_waitcnt vmcnt(0)
77- ; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
78- ; GFX11-NEXT: s_waitcnt vmcnt(0)
79- ; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
80- ; GFX11-NEXT: s_waitcnt vmcnt(0)
81- ; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
82- ; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
83- ; GFX11-NEXT: s_endpgm
69+ ; GFX11-TRUE16-LABEL: mad_u16:
70+ ; GFX11-TRUE16: ; %bb.0: ; %entry
71+ ; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
72+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
73+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
74+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
75+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
76+ ; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
77+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
78+ ; GFX11-TRUE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
79+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
80+ ; GFX11-TRUE16-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
81+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
82+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
83+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
84+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
85+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
86+ ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
87+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
88+ ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
89+ ; GFX11-TRUE16-NEXT: s_endpgm
90+ ;
91+ ; GFX11-FAKE16-LABEL: mad_u16:
92+ ; GFX11-FAKE16: ; %bb.0: ; %entry
93+ ; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
94+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
95+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
96+ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
97+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
98+ ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
99+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
100+ ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
101+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
102+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
103+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
104+ ; GFX11-FAKE16-NEXT: v_mad_u16 v0, v1, v2, v0
105+ ; GFX11-FAKE16-NEXT: global_store_b16 v3, v0, s[0:1]
106+ ; GFX11-FAKE16-NEXT: s_endpgm
84107 ptr addrspace (1 ) %r ,
85108 ptr addrspace (1 ) %a ,
86109 ptr addrspace (1 ) %b ,
@@ -121,11 +144,20 @@ define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
121144; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
122145; GFX10-NEXT: s_setpc_b64 s[30:31]
123146;
124- ; GFX11-LABEL: v_mad_u16:
125- ; GFX11: ; %bb.0:
126- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127- ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
128- ; GFX11-NEXT: s_setpc_b64 s[30:31]
147+ ; GFX11-TRUE16-LABEL: v_mad_u16:
148+ ; GFX11-TRUE16: ; %bb.0:
149+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
151+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
152+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
153+ ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
154+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
155+ ;
156+ ; GFX11-FAKE16-LABEL: v_mad_u16:
157+ ; GFX11-FAKE16: ; %bb.0:
158+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159+ ; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
160+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
129161 %mul = mul i16 %arg0 , %arg1
130162 %add = add i16 %mul , %arg2
131163 ret i16 %add
@@ -151,13 +183,23 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
151183; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
152184; GFX10-NEXT: s_setpc_b64 s[30:31]
153185;
154- ; GFX11-LABEL: v_mad_u16_zext:
155- ; GFX11: ; %bb.0:
156- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157- ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
158- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
159- ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
160- ; GFX11-NEXT: s_setpc_b64 s[30:31]
186+ ; GFX11-TRUE16-LABEL: v_mad_u16_zext:
187+ ; GFX11-TRUE16: ; %bb.0:
188+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
190+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
191+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
192+ ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
193+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
194+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
195+ ;
196+ ; GFX11-FAKE16-LABEL: v_mad_u16_zext:
197+ ; GFX11-FAKE16: ; %bb.0:
198+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199+ ; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
200+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
201+ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
202+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
161203 %mul = mul i16 %arg0 , %arg1
162204 %add = add i16 %mul , %arg2
163205 %zext = zext i16 %add to i32
@@ -187,13 +229,23 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
187229; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
188230; GFX10-NEXT: s_setpc_b64 s[30:31]
189231;
190- ; GFX11-LABEL: v_mad_u16_zext64:
191- ; GFX11: ; %bb.0:
192- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193- ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
194- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
195- ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
196- ; GFX11-NEXT: s_setpc_b64 s[30:31]
232+ ; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
233+ ; GFX11-TRUE16: ; %bb.0:
234+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
236+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
237+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
238+ ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
239+ ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
240+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
241+ ;
242+ ; GFX11-FAKE16-LABEL: v_mad_u16_zext64:
243+ ; GFX11-FAKE16: ; %bb.0:
244+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245+ ; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
246+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247+ ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
248+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
197249 %mul = mul i16 %arg0 , %arg1
198250 %add = add i16 %mul , %arg2
199251 %zext = zext i16 %add to i64
0 commit comments