From 5d6135cd918fde8e43b653eec7b5106e3779bcf5 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Tue, 19 Mar 2024 14:41:01 -0400 Subject: [PATCH 1/6] [AMDGPU] Add a trap lowering workaround for gfx11 On gfx11 shaders run with PRIV=1, which causes `s_trap 2` to be treated as a nop, which means it isn't a correct lowering for the trap intrinsic. As a workaround, this commit instead lowers the trap intrinsic to instructions that simulate the behavior of s_trap 2. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 ++- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 47 +++++++++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9 +++ llvm/lib/Target/AMDGPU/SIInstructions.td | 6 ++ .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 40 +++++++++++ llvm/test/CodeGen/AMDGPU/trap-abis.ll | 68 +++++++++++++++++++ 11 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bee43b6c18c88..1a4711dc06c4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5376,6 +5376,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) NODE_NAME_CASE(ENDPGM_TRAP) + NODE_NAME_CASE(SIMULATED_TRAP) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index f10a357125e56..72661a8d29f81 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -407,6 +407,9 @@ enum NodeType : 
unsigned { // s_endpgm, but we may want to insert it in the middle of the block. ENDPGM_TRAP, + // "s_trap 2" equivalent on hardware that does not support it. + SIMULATED_TRAP, + // Return to a shader part's epilog code. RETURN_TO_EPILOG, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 82f58ea38fd0a..702f6e67c5527 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, [SDNPHasChain]>; +def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone, + [SDNPHasChain]>; def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e55d1de01b4fd..bb79c4a376e4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6726,8 +6726,14 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( bool AMDGPULegalizerInfo::legalizeTrapHsa( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.buildInstr(AMDGPU::S_TRAP) - .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)); + if (!ST.requiresSimulatedTrap()) { + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)); + MI.eraseFromParent(); + return true; + } + + ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc()); MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4da10beabe316..67868f273b91d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -443,6 +443,8 @@ class GCNSubtarget final : public 
AMDGPUGenSubtargetInfo, return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; } + bool requiresSimulatedTrap() const { return getGeneration() == GFX11; } + bool supportsGetDoorbellID() const { // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. return getGeneration() >= GFX9; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81a231f0cade6..174182a5eabba 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5409,6 +5409,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return SplitBB; } + case AMDGPU::SIMULATED_TRAP: { + assert(Subtarget->requiresSimulatedTrap()); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineBasicBlock *SplitBB = + TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); + MI.eraseFromParent(); + return SplitBB; + } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) @@ -6627,6 +6635,9 @@ SDValue SITargetLowering::lowerTrapHsa( SDLoc SL(Op); SDValue Chain = Op.getOperand(0); + if (Subtarget->requiresSimulatedTrap()) + return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); + uint64_t TrapID = static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap); SDValue Ops[] = { Chain, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f4b21b7dfac39..0d5928cc25e3b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2026,6 +2026,53 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { } } +MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, + MachineBasicBlock &MBB, + MachineInstr &MI, + const DebugLoc &DL) const { + MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); + 
MF->push_back(HaltLoop); + + // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this + // will be a nop. + BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP)) + .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); + Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg) + .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) + .addUse(AMDGPU::M0); + Register And0x3ff = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), And0x3ff) + .addUse(DoorbellReg) + .addImm(0x3ff); + Register SetWaveAbortBit = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) + .addUse(And0x3ff) + .addImm(0x400); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addUse(SetWaveAbortBit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_INTERRUPT); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addUse(AMDGPU::TTMP2); + BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop); + + BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5); + BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH)) + .addMBB(HaltLoop); + + if (SplitBB != &MBB) + MBB.removeSuccessor(SplitBB); + MBB.addSuccessor(HaltLoop); + HaltLoop->addSuccessor(HaltLoop); + + return SplitBB; +} + unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { switch (MI.getOpcode()) { default: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4c5978cdc6665..b314b9b2fb513 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1194,6 +1194,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned Quantity) const override; void insertReturn(MachineBasicBlock &MBB) const; + + ///
Build instructions that simulate the behavior of an `s_trap 2` instruction + /// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is + /// interpreted as a nop. + MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI, + MachineBasicBlock &MBB, + MachineInstr &MI, + const DebugLoc &DL) const; + /// Return the number of wait states that result from executing this /// instruction. static unsigned getNumWaitStates(const MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1c942dcefdace..3f9a048a329b2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI< let usesCustomInserter = 1; } +def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)], + "SIMULATED_TRAP"> { + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + def ATOMIC_FENCE : SPseudoInstSI< (outs), (ins i32imm:$ordering, i32imm:$scope), [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))], diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir new file mode 100644 index 0000000000000..0795e2b68bec9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GCN %s + +--- +name: test_trap +body: | + bb.0: + ; GCN-LABEL: name: test_trap + ; GCN: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GCN-NEXT: S_TRAP 2 + ; GCN-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 =
S_SENDMSG_RTN_B32 128 + ; GCN-NEXT: $ttmp2 = S_MOV_B32 $m0 + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc + ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc + ; GCN-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] + ; GCN-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 + ; GCN-NEXT: $m0 = S_MOV_B32 $ttmp2 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: .1: + ; GCN-NEXT: successors: + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: .2: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_SETHALT 5 + ; GCN-NEXT: S_BRANCH %bb.2 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(p1) = G_CONSTANT i64 0 + G_STORE %0, %1 :: (store 1, addrspace 1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_STORE %0, %1 :: (store 1, addrspace 1) + +... diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 3cd6c98ef4b8e..8d5d752925742 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -3,6 +3,7 @@ ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s +; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s declare void @llvm.trap() #0 declare void @llvm.debugtrap() #1 @@ -49,6 +50,27 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1] ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) ; 
HSA-NOTRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-LABEL: trap: +; HSA-TRAP-GFX1100: ; %bb.0: +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-NEXT: s_branch .LBB0_1 store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -128,6 +150,37 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-NOTRAP-GFX900-NEXT: s_endpgm ; HSA-NOTRAP-GFX900-NEXT: .LBB1_2: ; %trap ; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-LABEL: non_entry_trap: +; HSA-TRAP-GFX1100: ; %bb.0: ; %entry +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v1 +; HSA-TRAP-GFX1100-NEXT: s_cbranch_vccz .LBB1_2 +; HSA-TRAP-GFX1100-NEXT: ; %bb.1: ; %ret +; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v1, 3 +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; 
HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_nop 0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; HSA-TRAP-GFX1100-NEXT: s_endpgm +; HSA-TRAP-GFX1100-NEXT: .LBB1_2: ; %trap +; HSA-TRAP-GFX1100-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10 +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-NEXT: s_branch .LBB1_3 entry: %tmp29 = load volatile i32, ptr addrspace(1) %arg0 %cmp = icmp eq i32 %tmp29, -1 @@ -197,6 +250,21 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1] ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: s_endpgm +; +; HSA-TRAP-GFX1100-LABEL: debugtrap: +; HSA-TRAP-GFX1100: ; %bb.0: +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 +; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v2, 2 +; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_trap 3 +; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-NEXT: s_nop 0 +; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; HSA-TRAP-GFX1100-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 call void 
@llvm.debugtrap() store volatile i32 2, ptr addrspace(1) %arg0 From 4f2c25e6e1345d890d04168a08136043c1d1047b Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Thu, 21 Mar 2024 13:23:07 -0400 Subject: [PATCH 2/6] Add an -O0 test --- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 8d5d752925742..dcc5fbd142c42 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -4,6 +4,7 @@ ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s +; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s declare void @llvm.trap() #0 declare void @llvm.debugtrap() #1 @@ -71,6 +72,27 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX1100-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 ; HSA-TRAP-GFX1100-NEXT: s_branch .LBB0_1 +; +; HSA-TRAP-GFX1100-O0-LABEL: trap: +; HSA-TRAP-GFX1100-O0: ; %bb.0: +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 +; 
HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB0_1 store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -181,6 +203,53 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX1100-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 ; HSA-TRAP-GFX1100-NEXT: s_sethalt 5 ; HSA-TRAP-GFX1100-NEXT: s_branch .LBB1_3 +; +; HSA-TRAP-GFX1100-O0-LABEL: non_entry_trap: +; HSA-TRAP-GFX1100-O0: ; %bb.0: ; %entry +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b64 s[2:3], s[0:1] +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s2, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v0, s3, 1 +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 s0, -1 +; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $sgpr1 +; HSA-TRAP-GFX1100-O0-NEXT: v_cmp_eq_u32_e64 s0, v0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_vccnz .LBB1_2 +; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1: ; %trap +; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, 
sendmsg(MSG_RTN_GET_DOORBELL) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff +; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400 +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0 +; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB1_3 +; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_2: ; %ret +; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1 +; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off ; 4-byte Folded Reload +; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6 +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 0 +; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 +; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0 +; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm +; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5 +; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB1_3 entry: %tmp29 = load volatile i32, ptr addrspace(1) %arg0 %cmp = icmp eq i32 %tmp29, -1 From b575811e688b5b09630da5d49cb359b8f8863da6 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Fri, 22 Mar 2024 10:26:55 -0400 Subject: [PATCH 3/6] Address review comments --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 +++-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 8 ++- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 +++-- .../AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 62 +++++++++++-------- 4 files changed, 59 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index bb79c4a376e4e..8caac830673ff 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6724,16 +6724,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( return true; } -bool AMDGPULegalizerInfo::legalizeTrapHsa( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - if (!ST.requiresSimulatedTrap()) { - B.buildInstr(AMDGPU::S_TRAP) - .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)); +bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + if (ST.requiresSimulatedTrap()) { + ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, + MI.getDebugLoc()); MI.eraseFromParent(); return true; } - ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc()); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)); MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 67868f273b91d..266255c16f192 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -23,6 +23,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/TargetParser/TargetParser.h" #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" @@ -443,7 +444,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; } - bool requiresSimulatedTrap() const { return getGeneration() == GFX11; } + // True on hardware where 's_trap 2' is treated as a nop that must be + // simulated. + bool requiresSimulatedTrap() const { + AMDGPU::IsaVersion V = AMDGPU::getIsaVersion(getCPU()); + return V.Major == 11 && V.Minor <= 3; + } bool supportsGetDoorbellID() const { // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0d5928cc25e3b..8f88a991eae96 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2035,6 +2035,9 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock(); MF->push_back(HaltLoop); + constexpr unsigned DoorbellIDMask = 0x3ff; + constexpr unsigned ECQueueWaveAbort = 0x400; + // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this // will be a nop. BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP)) @@ -2044,15 +2047,16 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2) .addUse(AMDGPU::M0); - Register And0x3ff = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), And0x3ff) + Register DoorbellRegMasked = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked) .addUse(DoorbellReg) - .addImm(0x3ff); + .addImm(DoorbellIDMask); Register SetWaveAbortBit = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit) - .addUse(And0x3ff) - .addImm(0x400); + .addUse(DoorbellRegMasked) + .addImm(ECQueueWaveAbort); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addUse(SetWaveAbortBit); BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index 0795e2b68bec9..de3fedaf14f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -1,36 +1,44 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 2 -# RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s --- name: test_trap body: | bb.0: - ; GCN-LABEL: name: test_trap - ; GCN: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 - ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) - ; GCN-NEXT: S_TRAP 2 - ; GCN-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 - ; GCN-NEXT: $ttmp2 = S_MOV_B32 $m0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc - ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc - ; GCN-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] - ; GCN-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 - ; GCN-NEXT: $m0 = S_MOV_B32 $ttmp2 - ; GCN-NEXT: S_BRANCH %bb.2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: .1: - ; GCN-NEXT: successors: - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: .2: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_SETHALT 5 - ; GCN-NEXT: S_BRANCH %bb.2 + ; GFX1100-LABEL: name: test_trap + ; GFX1100: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: S_TRAP 2 + ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128 + ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0 + ; GFX1100-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc + ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc + ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]] + ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0 + ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2 + ; GFX1100-NEXT: S_BRANCH %bb.2 + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .1: + ; GFX1100-NEXT: successors: + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: .2: + ; GFX1100-NEXT: successors: %bb.2(0x80000000) + ; GFX1100-NEXT: {{ $}} + ; GFX1100-NEXT: S_SETHALT 5 + ; GFX1100-NEXT: S_BRANCH %bb.2 + ; + ; GFX1150-LABEL: name: test_trap + ; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) + ; GFX1150-NEXT: S_TRAP 2 + ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1) %0:_(s8) = G_CONSTANT i8 0 %1:_(p1) = G_CONSTANT i64 0 G_STORE %0, %1 :: (store 1, addrspace 1) From 258a604d7614d0a5c09385bb5e7ab7fa064c37ad Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Mon, 25 Mar 2024 10:13:08 -0400 Subject: [PATCH 4/6] Use a feature, switch to G_TRAP --- llvm/lib/Target/AMDGPU/AMDGPU.td | 9 ++++++++- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +++- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 11 +++-------- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++++-- .../CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 2 +- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 37dcfef3b2a3d..2d9916b6ec01f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -303,6 +303,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug", "MSAA loads not honoring 
dst_sel bug" >; +def FeaturePrivEnabledBug : SubtargetFeature<"priv-enabled-bug", + "HasPrivEnabledBug", + "true", + "Hardware that runs with PRIV=1 workaround" +>; + class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -1489,7 +1495,8 @@ def FeatureISAVersion11_0_Common : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureMSAALoadDstSelBug, FeatureVALUTransUseHazard, - FeatureMADIntraFwdBug])>; + FeatureMADIntraFwdBug, + FeaturePrivEnabledBug])>; def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_0_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8caac830673ff..deba02e19667d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6727,7 +6727,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - if (ST.requiresSimulatedTrap()) { + // We need to simulate the 's_trap 2' instruction on targets that run in + // PRIV=1 (where it is treated as a nop). 
+ if (ST.hasPrivEnabledBug()) { ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 266255c16f192..f230023a05f27 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -23,7 +23,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/TargetParser/TargetParser.h" #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" @@ -224,6 +223,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasImageStoreD16Bug = false; bool HasImageGather4D16Bug = false; bool HasMSAALoadDstSelBug = false; + bool HasPrivEnabledBug = false; bool Has1_5xVGPRs = false; bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; @@ -444,13 +444,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; } - // True on hardware where 's_trap 2' is treated as a nop that must be - // simulated. - bool requiresSimulatedTrap() const { - AMDGPU::IsaVersion V = AMDGPU::getIsaVersion(getCPU()); - return V.Major == 11 && V.Minor <= 3; - } - bool supportsGetDoorbellID() const { // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 
return getGeneration() >= GFX9; @@ -1034,6 +1027,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } + bool hasPrivEnabledBug() const { return HasPrivEnabledBug; } + bool hasNSAEncoding() const { return HasNSAEncoding; } bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 174182a5eabba..003be15fb73df 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5410,7 +5410,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return SplitBB; } case AMDGPU::SIMULATED_TRAP: { - assert(Subtarget->requiresSimulatedTrap()); + assert(Subtarget->hasPrivEnabledBug()); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); MachineBasicBlock *SplitBB = TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); @@ -6635,7 +6635,9 @@ SDValue SITargetLowering::lowerTrapHsa( SDLoc SL(Op); SDValue Chain = Op.getOperand(0); - if (Subtarget->requiresSimulatedTrap()) + // We need to simulate the 's_trap 2' instruction on targets that run in + // PRIV=1 (where it is treated as a nop). + if (Subtarget->hasPrivEnabledBug()) return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index de3fedaf14f1f..703a94bb48470 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -42,7 +42,7 @@ body: | %0:_(s8) = G_CONSTANT i8 0 %1:_(p1) = G_CONSTANT i64 0 G_STORE %0, %1 :: (store 1, addrspace 1) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_TRAP G_STORE %0, %1 :: (store 1, addrspace 1) ... 
From ecf82cb0141e87c1e19544dbb4fc4b396507e227 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Thu, 18 Apr 2024 09:55:39 -0400 Subject: [PATCH 5/6] Use a more descriptive name for the feature --- llvm/lib/Target/AMDGPU/AMDGPU.td | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 2d9916b6ec01f..7dc01e4e49a28 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -303,10 +303,10 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug", "MSAA loads not honoring dst_sel bug" >; -def FeaturePrivEnabledBug : SubtargetFeature<"priv-enabled-bug", - "HasPrivEnabledBug", +def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug", + "HasPrivEnabledTrap2NopBug", "true", - "Hardware that runs with PRIV=1 workaround" + "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug" >; class SubtargetFeatureLDSBankCount : SubtargetFeature < @@ -1496,7 +1496,7 @@ def FeatureISAVersion11_0_Common : FeatureSet< [FeatureMSAALoadDstSelBug, FeatureVALUTransUseHazard, FeatureMADIntraFwdBug, - FeaturePrivEnabledBug])>; + FeaturePrivEnabledTrap2NopBug])>; def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_0_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index deba02e19667d..59345c3b0146e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6729,7 +6729,7 @@ bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI, MachineIRBuilder &B) const { // We need to simulate the 's_trap 2' instruction on targets that run in // PRIV=1 (where it is treated as a nop). 
- if (ST.hasPrivEnabledBug()) { + if (ST.hasPrivEnabledTrap2NopBug()) { ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, MI.getDebugLoc()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f230023a05f27..a260d694c4382 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -223,7 +223,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasImageStoreD16Bug = false; bool HasImageGather4D16Bug = false; bool HasMSAALoadDstSelBug = false; - bool HasPrivEnabledBug = false; + bool HasPrivEnabledTrap2NopBug = false; bool Has1_5xVGPRs = false; bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; @@ -1027,7 +1027,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } - bool hasPrivEnabledBug() const { return HasPrivEnabledBug; } + bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } bool hasNSAEncoding() const { return HasNSAEncoding; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 003be15fb73df..d03a0070ebf78 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5410,7 +5410,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return SplitBB; } case AMDGPU::SIMULATED_TRAP: { - assert(Subtarget->hasPrivEnabledBug()); + assert(Subtarget->hasPrivEnabledTrap2NopBug()); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); MachineBasicBlock *SplitBB = TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); @@ -6637,7 +6637,7 @@ SDValue SITargetLowering::lowerTrapHsa( // We need to simulate the 's_trap 2' instruction on targets that run in // PRIV=1 (where it is treated as a nop). 
- if (Subtarget->hasPrivEnabledBug()) + if (Subtarget->hasPrivEnabledTrap2NopBug()) return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); From ab773ebad8f44104d5f224fc2a81b5ac9b021d10 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Mon, 22 Apr 2024 21:19:25 -0400 Subject: [PATCH 6/6] Add feature to gfx11-generic --- llvm/lib/Target/AMDGPU/AMDGPU.td | 1 + llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7dc01e4e49a28..acf0322470f5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1489,6 +1489,7 @@ def FeatureISAVersion11_Generic: FeatureSet< [FeatureMSAALoadDstSelBug, FeatureVALUTransUseHazard, FeatureUserSGPRInit16Bug, + FeaturePrivEnabledTrap2NopBug, FeatureRequiresCOV6])>; def FeatureISAVersion11_0_Common : FeatureSet< diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir index 703a94bb48470..ac98dca00be3d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s ---