Skip to content

Commit f3ca3ef

Browse files
committed
[AMDGPU] Lower llvm.amdgcn.queue.ptr intrinsic to using implicit kernel argument if feasible
1 parent 4f5d866 commit f3ca3ef

File tree

7 files changed

+81
-24
lines changed

7 files changed

+81
-24
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4456,6 +4456,27 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
44564456
return true;
44574457
}
44584458

4459+
/// Legalize a value that's loaded from implicit kernel arguments.
4460+
bool AMDGPULegalizerInfo::legalizeImplicitKernelargParameterPtr(
4461+
MachineInstr &MI, MachineIRBuilder &B, LLT Ty, unsigned Offset) const {
4462+
MachineFunction &MF = *MI.getMF();
4463+
Module *M = MF.getFunction().getParent();
4464+
4465+
assert(AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5);
4466+
4467+
Register Ptr = getKernargParameterPtr(B, Offset);
4468+
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4469+
MachineMemOperand *MMO = MF.getMachineMemOperand(
4470+
PtrInfo,
4471+
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4472+
MachineMemOperand::MOInvariant,
4473+
Ty, commonAlignment(Align(Ty.getSizeInBits()), Offset));
4474+
Register Temp = B.buildLoad(Ty, Ptr, *MMO).getReg(0);
4475+
B.buildCopy(MI.getOperand(0).getReg(), Temp);
4476+
MI.eraseFromParent();
4477+
return true;
4478+
}
4479+
44594480
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
44604481
MachineRegisterInfo &MRI,
44614482
MachineIRBuilder &B) const {
@@ -7312,9 +7333,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
73127333
case Intrinsic::amdgcn_dispatch_ptr:
73137334
return legalizePreloadedArgIntrin(MI, MRI, B,
73147335
AMDGPUFunctionArgInfo::DISPATCH_PTR);
7315-
case Intrinsic::amdgcn_queue_ptr:
7336+
case Intrinsic::amdgcn_queue_ptr: {
7337+
MachineFunction &MF = *MI.getMF();
7338+
Module *M = MF.getFunction().getParent();
7339+
if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7340+
uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset(
7341+
B.getMF(), AMDGPUTargetLowering::QUEUE_PTR);
7342+
return legalizeImplicitKernelargParameterPtr(MI, B, S64, Offset);
7343+
}
73167344
return legalizePreloadedArgIntrin(MI, MRI, B,
73177345
AMDGPUFunctionArgInfo::QUEUE_PTR);
7346+
}
73187347
case Intrinsic::amdgcn_implicit_buffer_ptr:
73197348
return legalizePreloadedArgIntrin(
73207349
MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
131131
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
132132
uint64_t Offset,
133133
Align Alignment = Align(4)) const;
134+
/// Legalize \p MI by loading a value of type \p Ty from the implicit kernel
/// arguments at byte offset \p Offset; the loaded value is copied into the
/// result register of \p MI, which is then erased.
bool legalizeImplicitKernelargParameterPtr(MachineInstr &MI,
135+
MachineIRBuilder &B, LLT Ty,
136+
unsigned Offset) const;
134137

135138
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
136139
MachineIRBuilder &B) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8414,8 +8414,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
84148414
return getPreloadedValue(DAG, *MFI, VT,
84158415
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
84168416
}
8417-
case Intrinsic::amdgcn_dispatch_ptr:
84188417
case Intrinsic::amdgcn_queue_ptr: {
8418+
const Module *M = DAG.getMachineFunction().getFunction().getParent();
8419+
if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5)
8420+
return loadImplicitKernelArgument(DAG, MVT::i64, DL, Align(8), QUEUE_PTR);
8421+
[[fallthrough]];
8422+
}
8423+
case Intrinsic::amdgcn_dispatch_ptr: {
84198424
if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
84208425
DiagnosticInfoUnsupported BadIntrin(
84218426
MF.getFunction(), "unsupported hsa intrinsic without hsa target",

llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
295295
;
296296
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
297297
; GFX8V5: ; %bb.0:
298-
; GFX8V5-NEXT: s_add_u32 s0, s6, 8
299-
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
300-
; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
301-
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
298+
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
299+
; GFX8V5-NEXT: s_add_u32 s2, s6, 8
300+
; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
301+
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
302302
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
303303
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
304304
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
305305
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
306+
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
307+
; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
308+
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
309+
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
306310
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
307311
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
308312
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -336,15 +340,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
336340
;
337341
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
338342
; GFX9V5: ; %bb.0:
343+
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
339344
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
340-
; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
341-
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
345+
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
346+
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
347+
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
348+
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
342349
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
343350
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
351+
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
344352
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
345353
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
346354
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
347-
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
348355
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
349356
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
350357
; GFX9V5-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
207207
; FIXEDABI-LABEL: marked_func_use_other_sgpr:
208208
; FIXEDABI: ; %bb.0:
209209
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; FIXEDABI-NEXT: s_mov_b64 s[6:7], 0xc8
211+
; FIXEDABI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
212+
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
210213
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
211214
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s7
212215
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
@@ -238,12 +241,15 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
238241
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
239242
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
240243
; FIXEDABI: ; %bb.0:
241-
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
242-
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
243-
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
244-
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
245-
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
246-
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
244+
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd0
245+
; FIXEDABI-NEXT: s_add_u32 s2, s4, 8
246+
; FIXEDABI-NEXT: s_addc_u32 s3, s5, 0
247+
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s2
248+
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s3
249+
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
250+
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s1
251+
; FIXEDABI-NEXT: v_mov_b32_e32 v2, s0
252+
; FIXEDABI-NEXT: flat_load_ubyte v2, v[2:3] glc
247253
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
248254
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
249255
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc

llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
2222
}
2323

2424
; GCN-LABEL: {{^}}use_queue_ptr:
25-
; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
25+
; GCN: s_mov_b64 s[4:5], 0xc8
26+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2627
define hidden void @use_queue_ptr() #1 {
2728
%queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
2829
%value = load volatile i32, ptr addrspace(4) %queue_ptr

llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -287,20 +287,24 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
287287
;
288288
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
289289
; GFX8V5: ; %bb.0:
290-
; GFX8V5-NEXT: s_add_u32 s0, s6, 8
291-
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
292-
; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
293-
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
290+
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
291+
; GFX8V5-NEXT: s_add_u32 s2, s6, 8
292+
; GFX8V5-NEXT: s_addc_u32 s3, s7, 0
293+
; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
294+
; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
295+
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
294296
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
295297
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
296298
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
297299
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
300+
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
301+
; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
302+
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
303+
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
298304
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
299305
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
300306
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
301307
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
302-
; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
303-
; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
304308
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
305309
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
306310
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
@@ -327,16 +331,18 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
327331
;
328332
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
329333
; GFX9V5: ; %bb.0:
334+
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xd0
330335
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
336+
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
337+
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
338+
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
331339
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
332340
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
333341
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
334-
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
335342
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
336343
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
337344
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
338345
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
339-
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
340346
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
341347
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
342348
; GFX9V5-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)