// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
+// | <hazard padding>                  |
+// |-----------------------------------|
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....| compile time; if present)
// |-----------------------------------|
-// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
+// | <FPR>                             |
+// | <hazard padding>                  |
+// | <GPR>                             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....| LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
//
// FIXME: also explain the redzone concept.
//
+// About stack hazards: Under some SME contexts, a coprocessor with its own
+// separate cache can be used for FP operations. This can create hazards if
+// the CPU and the SME unit try to access the same area of memory, including
+// if the access is to an area of the stack. To try to alleviate this we
+// attempt to introduce extra padding into the stack frame between FP and GPR
+// accesses, controlled by the StackHazardSize option. Without changing the
+// layout of the stack frame in the diagram above, a stack object of size
+// StackHazardSize is added between GPR and FPR CSRs. Another is added to the
+// stack objects section, and stack objects are sorted so that FPR > Hazard
+// padding slot > GPRs (where possible). Unfortunately some things are not
+// handled well (VLA area, arguments on the stack, objects with both GPR and
+// FPR accesses), but if those are controlled by the user then the entire
+// stack frame becomes GPR at the start/end with FPR in the middle, surrounded
+// by Hazard padding.
+//
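+// For example, with StackHazardSize == 1024 in a streaming function that has
+// both kinds of accesses, the intended shape is roughly (a sketch, assuming
+// the object sort fully separates the two classes):
+//   GPR CSRs | <1024b hazard> | FPR CSRs | FPR locals |
+//   <1024b hazard> | GPR locals
+//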
// An example of the prologue:
//
//     .globl __foo
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
    cl::desc("Emit homogeneous prologue and epilogue for the size "
             "optimization (default = off)"));

+// Stack hazard padding size. 0 = disabled.
+static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
+                                         cl::init(0), cl::Hidden);
+// Whether to insert padding into non-streaming functions (for testing).
+static cl::opt<bool>
+    StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
+                              cl::init(false), cl::Hidden);
+
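+// A usage sketch based on the options above (the size must be a non-zero
+// multiple of 16, see determineStackHazardSlot below):
+//   llc -mtriple=aarch64 -aarch64-stack-hazard-size=1024 \
+//       -aarch64-stack-hazard-in-non-streaming foo.ll
+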
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
@@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
  // update in so create a normal arithmetic instruction instead.
  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+    // If we are destroying the frame, make sure we add the increment after
+    // the last frame operation.
+    if (FrameFlag == MachineInstr::FrameDestroy)
+      ++MBBI;
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
                    false, false, nullptr, EmitCFI,
@@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs(
  }
  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
  bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
+  Register LastReg = 0;

  // When iterating backwards, the loop condition relies on unsigned wraparound.
  for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2922,8 +2954,15 @@ static void computeCalleeSaveRegisterPairs(
    else
      llvm_unreachable("Unsupported register class.");

+    // Add the stack hazard size as we transition from GPR->FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
+      ByteOffset += StackFillDir * StackHazardSize;
+    LastReg = RPI.Reg1;
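+    // E.g. when saving {x19, x20, d8} with StackHazardSize == 1024, d8's slot
+    // is placed a further StackFillDir * 1024 bytes along from the GPR slots,
+    // leaving the hazard gap between the two classes (illustrative sketch).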
+
    // Add the next reg to the pair if it is in the same register class.
-    if (unsigned(i + RegInc) < Count) {
+    if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
      Register NextReg = CSI[i + RegInc].getReg();
      bool IsFirst = i == FirstReg;
      switch (RPI.Type) {
@@ -3034,7 +3073,8 @@ static void computeCalleeSaveRegisterPairs(
      Offset += 8;
    RPI.Offset = Offset / Scale;

-    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+    assert((!RPI.isPaired() ||
+            (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
           "Offset out of bounds for LDP/STP immediate");

@@ -3455,6 +3495,81 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
  return true;
}

+// Return the FrameID for a Load/Store instruction by looking at the MMO.
+static std::optional<int> getLdStFrameID(const MachineInstr &MI,
+                                         const MachineFrameInfo &MFI) {
+  if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+    return std::nullopt;
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  auto *PSV =
+      dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
+  if (PSV)
+    return std::optional<int>(PSV->getFrameIndex());
+
+  if (MMO->getValue()) {
+    if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
+      for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
+           FI++)
+        if (MFI.getObjectAllocation(FI) == Al)
+          return FI;
+    }
+  }
+
+  return std::nullopt;
+}
+
+// Check if a Hazard slot is needed for the current function, and if so create
+// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
+// which can be used to determine if any hazard padding is needed.
+void AArch64FrameLowering::determineStackHazardSlot(
+    MachineFunction &MF, BitVector &SavedRegs) const {
+  if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
+      MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
+    return;
+
+  // Stack hazards are only needed in streaming functions.
+  SMEAttrs Attrs(MF.getFunction());
+  if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
+    return;
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Add a hazard slot if there are any CSR FPR registers, or there are any
+  // FP-only stack objects.
+  bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+    return AArch64::FPR64RegClass.contains(Reg) ||
+           AArch64::FPR128RegClass.contains(Reg) ||
+           AArch64::ZPRRegClass.contains(Reg) ||
+           AArch64::PPRRegClass.contains(Reg);
+  });
+  bool HasFPRStackObjects = false;
+  if (!HasFPRCSRs) {
+    std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
+    for (auto &MBB : MF) {
+      for (auto &MI : MBB) {
+        std::optional<int> FI = getLdStFrameID(MI, MFI);
+        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
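+          // Stack ID 2 is TargetStackID::ScalableVector, i.e. SVE objects;
+          // these count as FPR accesses alongside FP/NEON instructions.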
+          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+            FrameObjects[*FI] |= 2;
+          else
+            FrameObjects[*FI] |= 1;
+        }
+      }
+    }
+    HasFPRStackObjects =
+        any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
+  }
+
+  if (HasFPRCSRs || HasFPRStackObjects) {
+    int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
+    LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
+                      << StackHazardSize << "\n");
+    MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
+  }
+}
+
void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                BitVector &SavedRegs,
                                                RegScavenger *RS) const {
@@ -3595,6 +3710,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
    CSStackSize += 8;
  }

+  // Determine if a Hazard slot should be used, and increase the CSStackSize
+  // by StackHazardSize if so.
+  determineStackHazardSlot(MF, SavedRegs);
+  if (AFI->hasStackHazardSlotIndex())
+    CSStackSize += StackHazardSize;
+
  // Save number of saved regs, so we can easily update CSStackSize later.
  unsigned NumSavedRegs = SavedRegs.count();

@@ -3761,10 +3882,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
    CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
  }

+  Register LastReg = 0;
+  int HazardSlotIndex = std::numeric_limits<int>::max();
  for (auto &CS : CSI) {
    Register Reg = CS.getReg();
    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);

+    // Create a hazard slot as we switch between GPR and FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(Reg)) {
+      assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
+             "Unexpected register order for hazard slot");
+      HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+      LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                        << "\n");
+      AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+      if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+        MinCSFrameIndex = HazardSlotIndex;
+      if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+        MaxCSFrameIndex = HazardSlotIndex;
+    }
+
    unsigned Size = RegInfo->getSpillSize(*RC);
    Align Alignment(RegInfo->getSpillAlign(*RC));
    int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
@@ -3785,7 +3924,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
      if ((unsigned)FrameIdx > MaxCSFrameIndex)
        MaxCSFrameIndex = FrameIdx;
    }
+    LastReg = Reg;
+  }
+
+  // Add hazard slot in the case where no FPR CSRs are present.
+  if (AFI->hasStackHazardSlotIndex() &&
+      HazardSlotIndex == std::numeric_limits<int>::max()) {
+    HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+    LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                      << "\n");
+    AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+    if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+      MinCSFrameIndex = HazardSlotIndex;
+    if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+      MaxCSFrameIndex = HazardSlotIndex;
  }
+
  return true;
}

@@ -3798,6 +3952,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
  // function doesn't use a FP.
  if (AFI->hasStreamingModeChanges() && !hasFP(MF))
    return false;
+  // Don't allow stack slot scavenging with hazard slots, in case it moves
+  // objects into the wrong place.
+  if (AFI->hasStackHazardSlotIndex())
+    return false;
  return AFI->hasCalleeSaveStackFreeSpace();
}

@@ -4492,6 +4650,10 @@ struct FrameObject {
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
+
+  // Used to distinguish between FP and GPR accesses.
+  // 1 = GPR, 2 = FPR, 8 = Hazard Object.
+  unsigned Accesses = 0;
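+  // The values are OR'd together, so an object touched by both integer and
+  // FP code ends up as 3 (GPR|FPR) and is not treated as FPR-only below.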
};

class GroupBuilder {
@@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
-  // The "first" object goes first (closest to SP), followed by the members of
-  // the "first" group.
+  // If we want to include a stack hazard region, order FPR accesses < the
+  // hazard object < GPR accesses, in order to create a separation between the
+  // two. For the Accesses field 1 = GPR, 2 = FPR, 8 = Hazard Object.
+  //
+  // Otherwise the "first" object goes first (closest to SP), followed by the
+  // members of the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
@@ -4537,9 +4703,15 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  //
  // If all else equal, sort by the object index to keep the objects in the
  // original order.
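+  //
+  // For example, objects with Accesses {GPR, FPR, Hazard, GPR} sort as FPR,
+  // Hazard, GPR, GPR, so the hazard padding separates the FPR object from all
+  // GPR objects (an illustrative sketch).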
-  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
-                         A.ObjectIndex) <
-         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
-                         B.ObjectIndex);
+  if (A.IsValid != B.IsValid)
+    return A.IsValid;
+  // The mirrored B-side checks keep this comparison a strict weak ordering,
+  // which std::stable_sort requires.
+  if (A.Accesses == 2 && B.Accesses != 2)
+    return true;
+  if (B.Accesses == 2 && A.Accesses != 2)
+    return false;
+  if (A.Accesses == 8 && B.Accesses != 2)
+    return true;
+  if (B.Accesses == 8 && A.Accesses != 2)
+    return false;
+  return std::make_tuple(A.ObjectFirst, A.GroupFirst, A.GroupIndex,
+                         A.ObjectIndex) <
+         std::make_tuple(B.ObjectFirst, B.GroupFirst, B.GroupIndex,
+                         B.ObjectIndex);
}
} // namespace
@@ -4549,6 +4721,7 @@ void AArch64FrameLowering::orderFrameObjects(
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

+  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
@@ -4595,16 +4768,28 @@ void AArch64FrameLowering::orderFrameObjects(
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
+
+      if (AFI.hasStackHazardSlotIndex()) {
+        std::optional<int> FI = getLdStFrameID(MI, MFI);
+        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
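+          // As in determineStackHazardSlot: scalable-vector (SVE) stack
+          // objects and FP/NEON instructions both count as FPR accesses.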
+          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+            FrameObjects[*FI].Accesses |= 2;
+          else
+            FrameObjects[*FI].Accesses |= 1;
+        }
+      }
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

+  if (AFI.hasStackHazardSlotIndex())
+    FrameObjects[AFI.getStackHazardSlotIndex()].Accesses = 8;
+
  // If the function's tagged base pointer is pinned to a stack slot, we want to
  // put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer because IRG does
  // not allow an immediate offset.
-  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;