Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
49f9f87
Add subtarget feature
rovka Jan 24, 2025
64a3d2f
[AMDGPU] ISel & PEI for whole wave functions
rovka Jan 27, 2025
47594fd
Use MF instead of MBB
rovka Mar 17, 2025
08ef43e
Revert "Add subtarget feature"
rovka Mar 11, 2025
02d7aec
Add new CC. Do nothing
rovka Mar 19, 2025
e4b378f
Replace SubtargetFeature with CallingConv
rovka Mar 11, 2025
97ba693
Enable gisel in tests
rovka Mar 17, 2025
7b44133
GISel support
rovka Mar 11, 2025
2d2f85b
Rename pseudo to match others
rovka Mar 19, 2025
fb6d20a
Rename CC
rovka Mar 25, 2025
81adaba
Fix formatting
rovka Mar 25, 2025
9931578
Update tests after merge
rovka May 6, 2025
7b75dff
Fix bug in testcase
rovka May 6, 2025
de8c395
Test inreg args
rovka May 19, 2025
5b5e137
Add docs and fixme
rovka May 20, 2025
1123374
Remove kill flags on orig exec mask
rovka Jun 17, 2025
339d2c7
Add helper to add orig exec to return
rovka Jun 23, 2025
154c430
Test with single use of orig exec
rovka Jun 23, 2025
843136a
Test calling gfx func from wwf
rovka Jun 23, 2025
9e3d9f2
Test wave64
rovka Jun 24, 2025
f6f9337
Fix a few missed spots
rovka Jun 24, 2025
639fb8c
clang-format
rovka Jun 25, 2025
f19a8df
Fix CC in test
rovka Jun 27, 2025
846aa2b
Verifier checks for whole wave funcs
rovka Jun 27, 2025
9bed239
[AMDGPU] Intrinsic for launching whole wave functions
rovka Jan 24, 2025
974e0fb
Remove Verifier check that I moved to previous PR
rovka Jun 27, 2025
8dc9461
Remove embarrassing cast
rovka Jun 27, 2025
f487cb4
Address review comments
rovka Jul 21, 2025
b104da3
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/whole-wave-funcs-call
rovka Jul 21, 2025
3e6b02c
Merge branch 'main' into users/rovka/whole-wave-funcs-call
rovka Jul 22, 2025
197b56d
Fixup merge mishap
rovka Jul 22, 2025
906f8ca
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/whole-wave-funcs-call
rovka Jul 29, 2025
7a850d8
s/size != 0/empty
rovka Jul 29, 2025
6d9c46f
Address review comments
rovka Aug 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2587,6 +2587,18 @@ def int_amdgcn_cs_chain:
],
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;

// Run a function with all the lanes enabled. Only direct calls are allowed. The
// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
// calling convention and must not be variadic. The remaining arguments to the
// callee are taken from the arguments passed to the intrinsic. Lanes that are
// inactive at the point of the call will receive poison. The return value is
// the return value of the callee for the active lanes (there is no return
// value in the inactive ones).
//
// IntrConvergent: the result depends on the set of active lanes at the call
// site, so optimizations must not move the call across control flow that
// could change which lanes are active.
def int_amdgcn_call_whole_wave:
Intrinsic<[llvm_any_ty], // The return type of the callee.
[llvm_anyptr_ty, // The callee.
llvm_vararg_ty], // The arguments to the callee.
[IntrConvergent]>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2536,6 +2536,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
return true;
case Intrinsic::amdgcn_cs_chain:
case Intrinsic::amdgcn_call_whole_wave:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
Expand Down
37 changes: 37 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7982,6 +7982,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
HasTailCall = true;
return;
}
case Intrinsic::amdgcn_call_whole_wave: {
// Lower the intrinsic as an ordinary (non-tail) call: operand 0 is the
// callee, the remaining intrinsic operands become the call arguments.
TargetLowering::ArgListTy Args;

// The first argument is the callee. Skip it when assembling the call args.
TargetLowering::ArgListEntry Arg;
for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
Arg.Node = getValue(I.getArgOperand(Idx));
Arg.Ty = I.getArgOperand(Idx)->getType();
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}

// Forward the convergence-control token if the call site carries one.
SDValue ConvControlToken;
if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
ConvControlToken = getValue(Token);
}

// The callee is always lowered with the dedicated whole-wave calling
// convention, regardless of what the intrinsic call site was marked with.
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc())
.setChain(getRoot())
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
getValue(I.getArgOperand(0)), std::move(Args))
.setTailCall(false)
.setIsPreallocated(
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
.setConvergent(I.isConvergent())
.setConvergenceControlToken(ConvControlToken);
CLI.CB = &I;

std::pair<SDValue, SDValue> Result =
lowerInvokable(CLI, /*EHPadBB=*/nullptr);

// Result.first is null when the callee returns void; only record a value
// for the intrinsic when there is one.
if (Result.first.getNode())
setValue(&I, Result.first);
return;
}
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6612,6 +6612,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
case Intrinsic::amdgcn_call_whole_wave: {
// Only direct calls are supported, so the callee operand must be a
// Function. Note: on failure, Check reports the problem and returns from
// the enclosing function, so F is non-null on every line below.
auto *F = dyn_cast<Function>(Call.getArgOperand(0));
Check(F, "Indirect whole wave calls are not allowed", &Call);

CallingConv::ID CC = F->getCallingConv();
Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
"Callee must have the amdgpu_gfx_whole_wave calling convention",
&Call);

Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);

// The intrinsic call's extra operand (the callee) lines up against the
// callee's extra leading parameter (the active mask), so the counts must
// match exactly.
Check(Call.arg_size() == F->arg_size(),
"Call argument count must match callee argument count", &Call);

// The first argument of the call is the callee, and the first argument of
// the callee is the active mask. The rest of the arguments must match.
Check(F->arg_begin()->getType()->isIntegerTy(1),
"Callee must have i1 as its first argument", &Call);
for (auto [CallArg, FuncArg] :
drop_begin(zip_equal(Call.args(), F->args()))) {
Check(CallArg->getType() == FuncArg.getType(),
"Argument types must match", &Call);

// Check that inreg attributes match between call site and function
Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
FuncArg.hasInRegAttr(),
"Argument inreg attributes must match", &Call);
}
break;
}
case Intrinsic::amdgcn_s_prefetch_data: {
Check(
AMDGPU::isFlatGlobalAddrSpace(
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1464,9 +1464,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
// The only intrinsics that may appear as call targets here are
// amdgcn_cs_chain and amdgcn_call_whole_wave; both are lowered as calls to
// the function passed as their first operand.
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
switch (F->getIntrinsicID()) {
case Intrinsic::amdgcn_cs_chain:
return lowerChainCall(MIRBuilder, Info);
case Intrinsic::amdgcn_call_whole_wave:
Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;

// Get the callee from the original instruction, so it doesn't look like
// this is an indirect call.
Info.Callee = MachineOperand::CreateGA(
cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
// Drop the callee from the argument list; the remaining operands are
// the actual call arguments, and the intrinsic's varargs-ness must not
// leak into the lowered call.
Info.OrigArgs.erase(Info.OrigArgs.begin());
Info.IsVarArg = false;
break;
default:
llvm_unreachable("Unexpected intrinsic call");
}
}

if (Info.IsVarArg) {
Expand Down
174 changes: 174 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL

declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)

; Launch a whole wave function with VGPR and inreg (SGPR) arguments and use
; its i32 result. CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
; DAGISEL-LABEL: basic_test:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL-NEXT: s_wait_expcnt 0x0
; DAGISEL-NEXT: s_wait_samplecnt 0x0
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-NEXT: s_wait_kmcnt 0x0
; DAGISEL-NEXT: s_mov_b32 s0, s33
; DAGISEL-NEXT: s_mov_b32 s33, s32
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: s_mov_b32 s33, s0
; DAGISEL-NEXT: s_wait_loadcnt 0x0
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: basic_test:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: s_wait_expcnt 0x0
; GISEL-NEXT: s_wait_samplecnt 0x0
; GISEL-NEXT: s_wait_bvhcnt 0x0
; GISEL-NEXT: s_wait_kmcnt 0x0
; GISEL-NEXT: s_mov_b32 s0, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v42, s0, 2
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
; GISEL-NEXT: scratch_store_b32 off, v41, s33
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
; GISEL-NEXT: v_writelane_b32 v42, s30, 0
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: v_writelane_b32 v42, s31, 1
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: global_store_b32 v[40:41], v0, off
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_load_b32 v41, off, s33
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: s_mov_b32 s33, s0
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_setpc_b64 s[30:31]
%y = add i32 %x, 13
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
store i32 %ret, ptr addrspace(1) %ptr
ret void
}

declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)

; Launch a whole wave function that returns void (no result to read back).
; CHECK lines are autogenerated by update_llc_test_checks.py; regenerate them
; instead of editing by hand.
define amdgpu_gfx void @ret_void(i32 %x) {
; DAGISEL-LABEL: ret_void:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL-NEXT: s_wait_expcnt 0x0
; DAGISEL-NEXT: s_wait_samplecnt 0x0
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-NEXT: s_wait_kmcnt 0x0
; DAGISEL-NEXT: s_mov_b32 s0, s33
; DAGISEL-NEXT: s_mov_b32 s33, s32
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
; DAGISEL-NEXT: s_mov_b32 s33, s0
; DAGISEL-NEXT: s_wait_loadcnt 0x0
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: ret_void:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-NEXT: s_wait_expcnt 0x0
; GISEL-NEXT: s_wait_samplecnt 0x0
; GISEL-NEXT: s_wait_bvhcnt 0x0
; GISEL-NEXT: s_wait_kmcnt 0x0
; GISEL-NEXT: s_mov_b32 s0, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: v_writelane_b32 v40, s0, 2
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_mov_b32 exec_lo, s1
; GISEL-NEXT: s_mov_b32 s33, s0
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_setpc_b64 s[30:31]
call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
ret void
}

26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
ret i64 %ret
}

declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)

; Make sure we don't pass the first argument (i1).
; IRTranslator-level check: the intrinsic's callee operand is dropped from the
; lowered call's argument list, so only %x is passed in $vgpr0.
define amdgpu_cs void @call(i32 %x, ptr %p) {
; CHECK-LABEL: name: call
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
; CHECK-NEXT: S_ENDPGM 0
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
store i32 %ret, ptr %p
ret void
}
Loading