Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
49f9f87
Add subtarget feature
rovka Jan 24, 2025
64a3d2f
[AMDGPU] ISel & PEI for whole wave functions
rovka Jan 27, 2025
47594fd
Use MF instead of MBB
rovka Mar 17, 2025
08ef43e
Revert "Add subtarget feature"
rovka Mar 11, 2025
02d7aec
Add new CC. Do nothing
rovka Mar 19, 2025
e4b378f
Replace SubtargetFeature with CallingConv
rovka Mar 11, 2025
97ba693
Enable gisel in tests
rovka Mar 17, 2025
7b44133
GISel support
rovka Mar 11, 2025
2d2f85b
Rename pseudo to match others
rovka Mar 19, 2025
fb6d20a
Rename CC
rovka Mar 25, 2025
81adaba
Fix formatting
rovka Mar 25, 2025
9931578
Update tests after merge
rovka May 6, 2025
7b75dff
Fix bug in testcase
rovka May 6, 2025
de8c395
Test inreg args
rovka May 19, 2025
5b5e137
Add docs and fixme
rovka May 20, 2025
1123374
Remove kill flags on orig exec mask
rovka Jun 17, 2025
339d2c7
Add helper to add orig exec to return
rovka Jun 23, 2025
154c430
Test with single use of orig exec
rovka Jun 23, 2025
843136a
Test calling gfx func from wwf
rovka Jun 23, 2025
9e3d9f2
Test wave64
rovka Jun 24, 2025
f6f9337
Fix a few missed spots
rovka Jun 24, 2025
639fb8c
clang-format
rovka Jun 25, 2025
f19a8df
Fix CC in test
rovka Jun 27, 2025
846aa2b
Verifier checks for whole wave funcs
rovka Jun 27, 2025
9bed239
[AMDGPU] Intrinsic for launching whole wave functions
rovka Jan 24, 2025
974e0fb
Remove Verifier check that I moved to previous PR
rovka Jun 27, 2025
8dc9461
Remove embarrassing cast
rovka Jun 27, 2025
f487cb4
Address review comments
rovka Jul 21, 2025
b104da3
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/whole-wave-funcs-call
rovka Jul 21, 2025
3e6b02c
Merge branch 'main' into users/rovka/whole-wave-funcs-call
rovka Jul 22, 2025
197b56d
Fixup merge mishap
rovka Jul 22, 2025
906f8ca
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/whole-wave-funcs-call
rovka Jul 29, 2025
7a850d8
s/size != 0/empty
rovka Jul 29, 2025
6d9c46f
Address review comments
rovka Aug 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2587,6 +2587,18 @@ def int_amdgcn_cs_chain:
],
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;

// Run a function with all the lanes enabled. Only direct calls are allowed. The
// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
// calling convention and must not be variadic. The remaining arguments to the
// callee are taken from the arguments passed to the intrinsic. Lanes that are
// inactive at the point of the call will receive poison. The return value is
// the return value of the callee for the active lanes (there is no return
// value in the inactive ones).
//
// IntrConvergent: the result depends on the set of active lanes at the call
// site, so optimizations must not move the call across control flow that
// could change which lanes are active.
def int_amdgcn_call_whole_wave:
Intrinsic<[llvm_any_ty], // The return type of the callee.
[llvm_anyptr_ty, // The callee.
llvm_vararg_ty], // The arguments to the callee.
[IntrConvergent]>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2536,6 +2536,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
return true;
case Intrinsic::amdgcn_cs_chain:
case Intrinsic::amdgcn_call_whole_wave:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
Expand Down
37 changes: 37 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7982,6 +7982,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
HasTailCall = true;
return;
}
case Intrinsic::amdgcn_call_whole_wave: {
// Lower the intrinsic as an ordinary (non-tail) call: operand 0 is the
// callee, the remaining intrinsic operands become the call arguments.
TargetLowering::ArgListTy Args;

// The first argument is the callee. Skip it when assembling the call args.
TargetLowering::ArgListEntry Arg;
for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
Arg.Node = getValue(I.getArgOperand(Idx));
Arg.Ty = I.getArgOperand(Idx)->getType();
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}

// Forward the convergence-control token if the call site carries one.
SDValue ConvControlToken;
if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
ConvControlToken = getValue(Token);
}

// The callee is always lowered with the dedicated whole-wave calling
// convention, regardless of what the intrinsic call site was marked with.
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc())
.setChain(getRoot())
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
getValue(I.getArgOperand(0)), std::move(Args))
.setTailCall(false)
.setIsPreallocated(
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
.setConvergent(I.isConvergent())
.setConvergenceControlToken(ConvControlToken);
CLI.CB = &I;

std::pair<SDValue, SDValue> Result =
lowerInvokable(CLI, /*EHPadBB=*/nullptr);

// Result.first is null when the callee returns void; only record a value
// for the intrinsic when there is one.
if (Result.first.getNode())
setValue(&I, Result.first);
return;
}
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6612,6 +6612,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
case Intrinsic::amdgcn_call_whole_wave: {
// Only direct calls are supported, so the callee operand must be a
// Function. Note: on failure, Check reports the problem and returns from
// the enclosing function, so F is non-null on every line below.
auto *F = dyn_cast<Function>(Call.getArgOperand(0));
Check(F, "Indirect whole wave calls are not allowed", &Call);

CallingConv::ID CC = F->getCallingConv();
Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
"Callee must have the amdgpu_gfx_whole_wave calling convention",
&Call);

Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);

// The intrinsic call's extra operand (the callee) lines up against the
// callee's extra leading parameter (the active mask), so the counts must
// match exactly.
Check(Call.arg_size() == F->arg_size(),
"Call argument count must match callee argument count", &Call);

// The first argument of the call is the callee, and the first argument of
// the callee is the active mask. The rest of the arguments must match.
Check(F->arg_begin()->getType()->isIntegerTy(1),
"Callee must have i1 as its first argument", &Call);
for (auto [CallArg, FuncArg] :
drop_begin(zip_equal(Call.args(), F->args()))) {
Check(CallArg->getType() == FuncArg.getType(),
"Argument types must match", &Call);

// Check that inreg attributes match between call site and function
Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
FuncArg.hasInRegAttr(),
"Argument inreg attributes must match", &Call);
}
break;
}
case Intrinsic::amdgcn_s_prefetch_data: {
Check(
AMDGPU::isFlatGlobalAddrSpace(
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1464,9 +1464,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
// The only intrinsics that may appear as call targets here are
// amdgcn_cs_chain and amdgcn_call_whole_wave; both are lowered as calls to
// the function passed as their first operand.
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
switch (F->getIntrinsicID()) {
case Intrinsic::amdgcn_cs_chain:
return lowerChainCall(MIRBuilder, Info);
case Intrinsic::amdgcn_call_whole_wave:
Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;

// Get the callee from the original instruction, so it doesn't look like
// this is an indirect call.
Info.Callee = MachineOperand::CreateGA(
cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
// Drop the callee from the argument list; the remaining operands are
// the actual call arguments, and the intrinsic's varargs-ness must not
// leak into the lowered call.
Info.OrigArgs.erase(Info.OrigArgs.begin());
Info.IsVarArg = false;
break;
default:
llvm_unreachable("Unexpected intrinsic call");
}
}

if (Info.IsVarArg) {
Expand Down
174 changes: 174 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL

declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)

; Launch a whole wave function with VGPR and inreg (SGPR) arguments and use
; its i32 result. CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
; DAGISEL-LABEL: basic_test:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL-NEXT: s_wait_expcnt 0x0
; DAGISEL-NEXT: s_wait_samplecnt 0x0
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-NEXT: s_wait_kmcnt 0x0
; DAGISEL-NEXT: s_mov_b32 s0, s33
; DAGISEL-NEXT: s_mov_b32 s33, s32
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: s_mov_b32 s33, s0
; DAGISEL-NEXT: s_wait_loadcnt 0x0
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: basic_test:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: s_wait_expcnt 0x0
; GISEL-NEXT: s_wait_samplecnt 0x0
; GISEL-NEXT: s_wait_bvhcnt 0x0
; GISEL-NEXT: s_wait_kmcnt 0x0
; GISEL-NEXT: s_mov_b32 s0, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v42, s0, 2
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
; GISEL-NEXT: scratch_store_b32 off, v41, s33
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
; GISEL-NEXT: v_writelane_b32 v42, s30, 0
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: v_writelane_b32 v42, s31, 1
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: global_store_b32 v[40:41], v0, off
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_load_b32 v41, off, s33
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: s_mov_b32 s33, s0
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_setpc_b64 s[30:31]
%y = add i32 %x, 13
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
store i32 %ret, ptr addrspace(1) %ptr
ret void
}

declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)

; Launch a whole wave function that returns void (no result to read back).
; CHECK lines are autogenerated by update_llc_test_checks.py; regenerate them
; instead of editing by hand.
define amdgpu_gfx void @ret_void(i32 %x) {
; DAGISEL-LABEL: ret_void:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL-NEXT: s_wait_expcnt 0x0
; DAGISEL-NEXT: s_wait_samplecnt 0x0
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-NEXT: s_wait_kmcnt 0x0
; DAGISEL-NEXT: s_mov_b32 s0, s33
; DAGISEL-NEXT: s_mov_b32 s33, s32
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: s_mov_b32 s33, s0
; DAGISEL-NEXT: s_wait_loadcnt 0x0
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: ret_void:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: s_wait_expcnt 0x0
; GISEL-NEXT: s_wait_samplecnt 0x0
; GISEL-NEXT: s_wait_bvhcnt 0x0
; GISEL-NEXT: s_wait_kmcnt 0x0
; GISEL-NEXT: s_mov_b32 s0, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v40, s0, 2
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: s_mov_b32 s33, s0
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_setpc_b64 s[30:31]
call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
ret void
}

26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
ret i64 %ret
}

declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)

; Make sure we don't pass the first argument (i1).
; IRTranslator-level check: the intrinsic's callee operand is dropped from the
; lowered call's argument list, so only %x is passed in $vgpr0.
define amdgpu_cs void @call(i32 %x, ptr %p) {
; CHECK-LABEL: name: call
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
; CHECK-NEXT: S_ENDPGM 0
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
store i32 %ret, ptr %p
ret void
}
Loading