Skip to content

Commit be06d03

Browse files
committed
[AMDGPU] Lower llvm.amdgcn.queue.ptr intrinsic to use an implicit kernel argument if feasible
1 parent 29e51f8 commit be06d03

File tree

4 files changed

+94
-42
lines changed

4 files changed

+94
-42
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
84148414
return getPreloadedValue(DAG, *MFI, VT,
84158415
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
84168416
}
8417-
case Intrinsic::amdgcn_dispatch_ptr:
84188417
case Intrinsic::amdgcn_queue_ptr: {
8418+
const Module *M = DAG.getMachineFunction().getFunction().getParent();
8419+
if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
8420+
return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
8421+
[[fallthrough]];
8422+
}
8423+
case Intrinsic::amdgcn_dispatch_ptr: {
84198424
if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
84208425
DiagnosticInfoUnsupported BadIntrin(
84218426
MF.getFunction(), "unsupported hsa intrinsic without hsa target",

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 72 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -204,26 +204,50 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
204204
}
205205

206206
define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
207-
; FIXEDABI-LABEL: marked_func_use_other_sgpr:
208-
; FIXEDABI: ; %bb.0:
209-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210-
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
211-
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s7
212-
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
213-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
214-
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
215-
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s9
216-
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
217-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
218-
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s4
219-
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s5
220-
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
221-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
222-
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
223-
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s11
224-
; FIXEDABI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
225-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
226-
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]
207+
; FIXEDABI-SDAG-LABEL: marked_func_use_other_sgpr:
208+
; FIXEDABI-SDAG: ; %bb.0:
209+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
211+
; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
212+
; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
213+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s6
214+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s7
215+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
216+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
217+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s8
218+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s9
219+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
220+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
221+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s4
222+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s5
223+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
224+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
225+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s10
226+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s11
227+
; FIXEDABI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
228+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
229+
; FIXEDABI-SDAG-NEXT: s_setpc_b64 s[30:31]
230+
;
231+
; FIXEDABI-GISEL-LABEL: marked_func_use_other_sgpr:
232+
; FIXEDABI-GISEL: ; %bb.0:
233+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s6
235+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s7
236+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
237+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
238+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s8
239+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s9
240+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
241+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
242+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s4
243+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s5
244+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v2, v[2:3] glc
245+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
246+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v2, s10
247+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s11
248+
; FIXEDABI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
249+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
250+
; FIXEDABI-GISEL-NEXT: s_setpc_b64 s[30:31]
227251
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
228252
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
229253
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -236,18 +260,34 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
236260
}
237261

238262
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
239-
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
240-
; FIXEDABI: ; %bb.0:
241-
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
242-
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
243-
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
244-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
245-
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
246-
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
247-
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
248-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
249-
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
250-
; FIXEDABI-NEXT: s_endpgm
263+
; FIXEDABI-SDAG-LABEL: marked_kernel_use_other_sgpr:
264+
; FIXEDABI-SDAG: ; %bb.0:
265+
; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd0
266+
; FIXEDABI-SDAG-NEXT: s_add_u32 s2, s4, 8
267+
; FIXEDABI-SDAG-NEXT: s_addc_u32 s3, s5, 0
268+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s2
269+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v1, s3
270+
; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
271+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v3, s1
272+
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s0
273+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v2, v[2:3] glc
274+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
275+
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0)
276+
; FIXEDABI-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
277+
; FIXEDABI-SDAG-NEXT: s_endpgm
278+
;
279+
; FIXEDABI-GISEL-LABEL: marked_kernel_use_other_sgpr:
280+
; FIXEDABI-GISEL: ; %bb.0:
281+
; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s4, 8
282+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
283+
; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s5, 0
284+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
285+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v0, s0
286+
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v1, s1
287+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
288+
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0)
289+
; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
290+
; FIXEDABI-GISEL-NEXT: s_endpgm
251291
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
252292
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
253293
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()

llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
2222
}
2323

2424
; GCN-LABEL: {{^}}use_queue_ptr:
25-
; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
25+
; GCN: s_mov_b64 s[4:5], 0xc8
26+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2627
define hidden void @use_queue_ptr() #1 {
2728
%queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
2829
%value = load volatile i32, ptr addrspace(4) %queue_ptr

llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
287287
;
288288
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
289289
; GFX8V5: ; %bb.0:
290-
; GFX8V5-NEXT: s_add_u32 s0, s6, 8
291-
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
292-
; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
293-
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
290+
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
291+
; GFX8V5-NEXT: s_add_u32 s2, s6, 8
292+
; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
293+
; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
294+
; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
295+
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
294296
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
295297
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
296298
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
297299
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
300+
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
301+
; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
302+
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
303+
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
298304
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
299305
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
300306
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
301307
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
302-
; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
303-
; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
304308
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
305309
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
306310
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
327331
;
328332
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
329333
; GFX9V5: ; %bb.0:
334+
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
330335
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
336+
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
337+
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
338+
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
331339
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
332340
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
333341
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
334-
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
335342
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
336343
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
337344
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
338345
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
339-
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
340346
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
341347
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
342348
; GFX9V5-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)