 // | async context if needed           |
 // | (a.k.a. "frame record")           |
 // |-----------------------------------| <- fp(=x29)
+// |   <hazard padding>                |
+// |-----------------------------------|
 // |                                   |
 // | callee-saved fp/simd/SVE regs     |
 // |                                   |
 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
 // |.the.standard.16-byte.alignment....| compile time; if present)
 // |-----------------------------------|
-// |                                   |
 // | local variables of fixed size     |
 // | including spill slots             |
+// |   <FPR>                           |
+// |   <hazard padding>                |
+// |   <GPR>                           |
 // |-----------------------------------| <- bp(not defined by ABI,
 // |.variable-sized.local.variables....|       LLVM chooses X19)
 // |.(VLAs)............................| (size of this area is unknown at
 //
 // FIXME: also explain the redzone concept.
 //
+// About stack hazards: Under some SME contexts, a coprocessor with its own
+// separate cache can be used for FP operations. This can create hazards if
+// the CPU and the SME unit try to access the same area of memory, including
+// if the access is to an area of the stack. To try to alleviate this we
+// attempt to introduce extra padding into the stack frame between FP and GPR
+// accesses, controlled by the StackHazardSize option. Without changing the
+// layout of the stack frame in the diagram above, a stack object of size
+// StackHazardSize is added between GPR and FPR CSRs. Another is added to the
+// stack objects section, and stack objects are sorted so that FPR > Hazard
+// padding slot > GPRs (where possible). Unfortunately some things are not
+// handled well (VLA area, arguments on the stack, objects with both GPR and
+// FPR accesses), but if those are controlled by the user then the entire
+// stack frame becomes GPR at the start/end with FPR in the middle, surrounded
+// by Hazard padding.
+//
 // An example of the prologue:
 //
 //     .globl __foo
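To make the hazard-padding placement above concrete, here is a minimal standalone sketch (a toy model, not LLVM code; the register list, the 1024-byte size, and names like kHazardSize are invented for illustration) of how an offset allocator inserts one pad at the GPR->FPR transition, mirroring the `ByteOffset += StackFillDir * StackHazardSize` step further down in this patch:

#include <cstdio>
#include <vector>

int main() {
  const int kHazardSize = 1024; // stand-in for -aarch64-stack-hazard-size
  struct Save { const char *Name; bool IsFP; };
  const std::vector<Save> CSRs = {
      {"x19", false}, {"x20", false}, {"d8", true}, {"d9", true}};
  int Offset = 0;
  bool LastWasFP = false;
  for (const Save &S : CSRs) {
    // Pad once, at the first GPR->FPR transition, like the CSR layout code.
    if (S.IsFP && !LastWasFP)
      Offset += kHazardSize;
    std::printf("%-3s at offset %d\n", S.Name, Offset);
    Offset += 8; // each save is 8 bytes in this toy model
    LastWasFP = S.IsFP;
  }
  return 0;
}

In this model x19/x20 land at offsets 0 and 8, while d8/d9 land at 1040 and 1048, so CPU (GPR) and SME-unit (FP) accesses never touch the same region.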
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
     cl::desc("Emit homogeneous prologue and epilogue for the size "
              "optimization (default = off)"));
 
+// Stack hazard padding size. 0 = disabled.
+static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
+                                         cl::init(0), cl::Hidden);
+// Whether to insert padding into non-streaming functions (for testing).
+static cl::opt<bool>
+    StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
+                              cl::init(false), cl::Hidden);
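Both knobs are hidden `cl::opt`s, so (assuming the usual llc-based testing workflow) they can be exercised with, e.g., `llc -mtriple=aarch64 -aarch64-stack-hazard-size=1024 ...`; `-aarch64-stack-hazard-in-non-streaming` lets tests force padding without SME streaming attributes. As `determineStackHazardSlot` below shows, a size that is zero or not a multiple of 16 disables the feature.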
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// Returns how much of the incoming argument stack area (in bytes) we should
@@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   // update in so create a normal arithmetic instruction instead.
   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+    // If we are destroying the frame, make sure we add the increment after the
+    // last frame operation.
+    if (FrameFlag == MachineInstr::FrameDestroy)
+      ++MBBI;
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
                     false, false, nullptr, EmitCFI,
@@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs(
   }
   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
+  Register LastReg = 0;
 
   // When iterating backwards, the loop condition relies on unsigned wraparound.
   for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2922,8 +2954,15 @@ static void computeCalleeSaveRegisterPairs(
     else
       llvm_unreachable("Unsupported register class.");
 
+    // Add the stack hazard size as we transition from GPR->FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
+      ByteOffset += StackFillDir * StackHazardSize;
+    LastReg = RPI.Reg1;
+
     // Add the next reg to the pair if it is in the same register class.
-    if (unsigned(i + RegInc) < Count) {
+    if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
       Register NextReg = CSI[i + RegInc].getReg();
       bool IsFirst = i == FirstReg;
       switch (RPI.Type) {
@@ -3034,7 +3073,8 @@ static void computeCalleeSaveRegisterPairs(
       Offset += 8;
     RPI.Offset = Offset / Scale;
 
-    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+    assert((!RPI.isPaired() ||
+            (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
 
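Two knock-on effects of the hazard slot are visible in this function: register pairing is disabled outright (the new `!AFI->hasStackHazardSlotIndex()` guard), since padding inserted mid-sequence would break the adjacency that LDP/STP pairs depend on, and the offset assertion is now restricted to paired saves, because an unpaired LDR/STR can legally address offsets outside the signed 7-bit LDP/STP immediate range once a large pad is in play.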
@@ -3455,6 +3495,80 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   return true;
 }
 
+// Return the FrameID for a Load/Store instruction by looking at the MMO.
+static std::optional<int> getLdStFrameID(const MachineInstr &MI,
+                                         const MachineFrameInfo &MFI) {
+  if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+    return std::nullopt;
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  auto *PSV =
+      dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
+  if (PSV)
+    return std::optional<int>(PSV->getFrameIndex());
+
+  if (MMO->getValue()) {
+    if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
+      for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
+           FI++)
+        if (MFI.getObjectAllocation(FI) == Al)
+          return FI;
+    }
+  }
+
+  return std::nullopt;
+}
+
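Note that `getLdStFrameID` only inspects the instruction's first memory operand: it prefers the `FixedStackPseudoSourceValue` attached to the MMO, and otherwise walks `getUnderlyingObject` (hence the new ValueTracking include) back to an `AllocaInst` and linearly scans the frame indices for the matching allocation. Instructions with merged or absent MMOs therefore classify as unknown (`std::nullopt`).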
+// Check if a Hazard slot is needed for the current function, and if so create
+// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
+// which can be used to determine if any hazard padding is needed.
+void AArch64FrameLowering::determineStackHazardSlot(
+    MachineFunction &MF, BitVector &SavedRegs) const {
+  if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
+      MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
+    return;
+
+  // Stack hazards are only needed in streaming functions.
+  SMEAttrs Attrs(MF.getFunction());
+  if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
+    return;
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Add a hazard slot if there are any CSR FPR registers, or if there are any
+  // fp-only stack objects.
+  bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+    return AArch64::FPR64RegClass.contains(Reg) ||
+           AArch64::FPR128RegClass.contains(Reg) ||
+           AArch64::ZPRRegClass.contains(Reg) ||
+           AArch64::PPRRegClass.contains(Reg);
+  });
+  bool HasFPRStackObjects = false;
+  if (!HasFPRCSRs) {
+    std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
+    for (auto &MBB : MF) {
+      for (auto &MI : MBB) {
+        std::optional<int> FI = getLdStFrameID(MI, MFI);
+        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
+          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+            FrameObjects[*FI] |= 2;
+          else
+            FrameObjects[*FI] |= 1;
+        }
+      }
+    }
+    HasFPRStackObjects =
+        any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
+  }
+
+  if (HasFPRCSRs || HasFPRStackObjects) {
+    int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
+    LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
+                      << StackHazardSize << "\n");
+    MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
+  }
+}
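In the scan above, bit 0 of a `FrameObjects` entry records a GPR access and bit 1 an FP access, so `(B & 3) == 2` picks out objects touched only by FP operations; the `MFI.getStackID(*FI) == 2` check corresponds to `TargetStackID::ScalableVector`, so SVE objects always count as FP-accessed. Also note this slot is created with `Align(16)` among the ordinary stack objects, separate from the CSR-area slot that `assignCalleeSavedSpillSlots` creates later.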
+
 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 RegScavenger *RS) const {
@@ -3595,6 +3709,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       CSStackSize += 8;
   }
 
+  // Determine if a Hazard slot should be used, and increase the CSStackSize by
+  // StackHazardSize if so.
+  determineStackHazardSlot(MF, SavedRegs);
+  if (AFI->hasStackHazardSlotIndex())
+    CSStackSize += StackHazardSize;
+
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
 
@@ -3761,10 +3881,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
     CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
   }
 
+  Register LastReg = 0;
+  int HazardSlotIndex = std::numeric_limits<int>::max();
   for (auto &CS : CSI) {
     Register Reg = CS.getReg();
     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
 
+    // Create a hazard slot as we switch between GPR and FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(Reg)) {
+      assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
+             "Unexpected register order for hazard slot");
+      HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+      LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                        << "\n");
+      AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+      if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+        MinCSFrameIndex = HazardSlotIndex;
+      if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+        MaxCSFrameIndex = HazardSlotIndex;
+    }
+
     unsigned Size = RegInfo->getSpillSize(*RC);
     Align Alignment(RegInfo->getSpillAlign(*RC));
     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
@@ -3785,7 +3923,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
       if ((unsigned)FrameIdx > MaxCSFrameIndex)
         MaxCSFrameIndex = FrameIdx;
     }
+    LastReg = Reg;
+  }
+
+  // Add hazard slot in the case where no FPR CSRs are present.
+  if (AFI->hasStackHazardSlotIndex() &&
+      HazardSlotIndex == std::numeric_limits<int>::max()) {
+    HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+    LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                      << "\n");
+    AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+    if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+      MinCSFrameIndex = HazardSlotIndex;
+    if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+      MaxCSFrameIndex = HazardSlotIndex;
   }
+
   return true;
 }
 
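Together these two code paths realize the first `<hazard padding>` entry in the frame diagram: a CSR-area slot placed at the GPR->FPR boundary (or after all CSRs when the function saves no FPRs), recorded via `setStackHazardCSRSlotIndex`. The pad separating FPR and GPR locals is the object that `determineStackHazardSlot` created earlier.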
@@ -3798,6 +3951,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   // function doesn't use a FP.
   if (AFI->hasStreamingModeChanges() && !hasFP(MF))
     return false;
+  // Don't allow stack slot scavenging with hazard slots, in case it moves
+  // objects into the wrong place.
+  if (AFI->hasStackHazardSlotIndex())
+    return false;
   return AFI->hasCalleeSaveStackFreeSpace();
 }
 
@@ -4492,6 +4649,11 @@ struct FrameObject {
   // This object's group (which always contains the object with
   // ObjectFirst==true) should be placed first.
   bool GroupFirst = false;
+
+  // Used to distinguish between FP and GPR accesses. The values are decided so
+  // that they sort FPR < Hazard < GPR and they can be or'd together.
+  unsigned Accesses = 0;
+  enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };
 };
 
 class GroupBuilder {
@@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
   // at the end. This also allows us to stop walking when we hit the
   // first invalid item after it's all sorted.
   //
-  // The "first" object goes first (closest to SP), followed by the members of
-  // the "first" group.
+  // If we want to include a stack hazard region, order FPR accesses < the
+  // hazard object < GPR accesses, in order to create a separation between the
+  // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
+  //
+  // Otherwise the "first" object goes first (closest to SP), followed by the
+  // members of the "first" group.
   //
   // The rest are sorted by the group index to keep the groups together.
   // Higher numbered groups are more likely to be around longer (i.e. untagged
@@ -4537,10 +4703,10 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
   //
   // If all else equal, sort by the object index to keep the objects in the
   // original order.
-  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
-                         A.ObjectIndex) <
-         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
-                         B.ObjectIndex);
+  return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst,
+                         A.GroupIndex, A.ObjectIndex) <
+         std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst,
+                         B.GroupIndex, B.ObjectIndex);
 }
 } // namespace
 
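As a sanity check on the comparator change, here is a minimal standalone sketch (toy types, not the LLVM structs) showing that placing `Accesses` right after `!IsValid` in the tuple sorts FPR objects (1) before the hazard slot (2) before GPR objects (4), with the object index as a final tie-breaker:

#include <algorithm>
#include <cstdio>
#include <tuple>
#include <vector>

struct Obj {
  bool IsValid;
  unsigned Accesses; // 1 = FPR, 2 = Hazard, 4 = GPR
  int Index;
};

int main() {
  std::vector<Obj> Objs = {
      {true, 4, 0}, {true, 1, 1}, {true, 2, 2}, {true, 4, 3}};
  std::sort(Objs.begin(), Objs.end(), [](const Obj &A, const Obj &B) {
    return std::make_tuple(!A.IsValid, A.Accesses, A.Index) <
           std::make_tuple(!B.IsValid, B.Accesses, B.Index);
  });
  for (const Obj &O : Objs) // prints fi#1, fi#2, fi#0, fi#3
    std::printf("fi#%d accesses=%u\n", O.Index, O.Accesses);
  return 0;
}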
@@ -4549,19 +4715,32 @@ void AArch64FrameLowering::orderFrameObjects(
   if (!OrderFrameObjects || ObjectsToAllocate.empty())
     return;
 
+  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
   for (auto &Obj : ObjectsToAllocate) {
     FrameObjects[Obj].IsValid = true;
     FrameObjects[Obj].ObjectIndex = Obj;
   }
 
-  // Identify stack slots that are tagged at the same time.
+  // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
+  // the same time.
   GroupBuilder GB(FrameObjects);
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       if (MI.isDebugInstr())
         continue;
+
+      if (AFI.hasStackHazardSlotIndex()) {
+        std::optional<int> FI = getLdStFrameID(MI, MFI);
+        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
+          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+            FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
+          else
+            FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
+        }
+      }
+
       int OpIndex;
       switch (MI.getOpcode()) {
       case AArch64::STGloop:
@@ -4600,11 +4779,20 @@ void AArch64FrameLowering::orderFrameObjects(
     GB.EndCurrentGroup();
   }
 
+  if (AFI.hasStackHazardSlotIndex()) {
+    FrameObjects[AFI.getStackHazardSlotIndex()].Accesses =
+        FrameObject::AccessHazard;
+    // If a stack object is unknown or both GPR and FPR, sort it into GPR.
+    for (auto &Obj : FrameObjects)
+      if (!Obj.Accesses ||
+          Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR))
+        Obj.Accesses = FrameObject::AccessGPR;
+  }
+
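After this normalization every valid object carries exactly one access class, so the `Accesses` sort key cleanly partitions the object area into an FPR region and a GPR region with the hazard slot between them; an object that is genuinely accessed both ways is sorted into the GPR region, so its FP accesses remain a known residual hazard, as the block comment at the top of the file notes.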
   // If the function's tagged base pointer is pinned to a stack slot, we want to
   // put that slot first when possible. This will likely place it at SP + 0,
   // and save one instruction when generating the base pointer because IRG does
   // not allow an immediate offset.
-  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
   if (TBPI) {
     FrameObjects[*TBPI].ObjectFirst = true;