Skip to content

Commit f2dc37e

Browse files
committed
[AArch64][SME] Support split ZPR and PPR area allocation
For a while we have supported the `-aarch64-stack-hazard-size=<size>` option, which adds "hazard padding" between GPRs and FPR/ZPRs. However, there is currently a hole in this mitigation as PPR and FPR/ZPR accesses to the same area also cause streaming memory hazards (this is noted by `-pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=<val>`), and the current stack layout places PPRs and ZPRs within the same area.

Which looks like:

------------------------------------  Higher address
| callee-saved gpr registers        |
|---------------------------------- |
| lr,fp (a.k.a. "frame record")     |
|-----------------------------------| <- fp(=x29)
|   <hazard padding>                |
|-----------------------------------|
| callee-saved fp/simd/SVE regs     |
|-----------------------------------|
|        SVE stack objects          |
|-----------------------------------|
| local variables of fixed size     |
|   <FPR>                           |
|   <hazard padding>                |
|   <GPR>                           |
------------------------------------| <- sp
                                    | Lower address

With this patch the stack (and hazard padding) is rearranged so that hazard padding is placed between the PPRs and ZPRs rather than within the (fixed size) callee-save region.

Which looks something like this:

------------------------------------  Higher address
| callee-saved gpr registers        |
|---------------------------------- |
| lr,fp (a.k.a. "frame record")     |
|-----------------------------------| <- fp(=x29)
|        callee-saved PPRs          |
|        PPR stack objects          | (These are SVE predicates)
|-----------------------------------|
|   <hazard padding>                |
|-----------------------------------|
|       callee-saved ZPR regs       | (These are SVE vectors)
|        ZPR stack objects          | Note: FPRs are promoted to ZPRs
|-----------------------------------|
| local variables of fixed size     |
|   <FPR>                           |
|   <hazard padding>                |
|   <GPR>                           |
------------------------------------| <- sp
                                    | Lower address

This layout is only enabled if:

* SplitSVEObjects are enabled (`-aarch64-split-sve-objects`)
  - (This may be enabled by default in a later patch)
* Streaming memory hazards are present
  - (`-aarch64-stack-hazard-size=<val>` != 0)
* PPRs and FPRs/ZPRs are on the stack
* There's no stack realignment or variable-sized objects
  - This is left as a TODO for now

Additionally, any FPR callee-saves that are present will be promoted to ZPRs. This is to prevent stack hazards between FPRs and GPRs in the fixed size callee-save area (which would otherwise require more hazard padding, or moving the FPR callee-saves).

This layout should resolve the hole in the hazard padding mitigation, and is not intended to change codegen for non-SME code.

Change-Id: I2e1906577c2ac79c40bc69e7c15e3ef09857445f
1 parent 5adbcdf commit f2dc37e

10 files changed

+2138
-403
lines changed

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1612,7 +1612,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
16121612
StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
16131613
MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
16141614
/*PreferFP=*/false,
1615-
/*ForSimm=*/true);
1615+
/*ForSimm=*/true,
1616+
/*FI=*/-1);
16161617
Register SrcReg = FrameReg;
16171618
if (FrameRegOffset) {
16181619
// Use output register as temporary.

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 193 additions & 50 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AArch64/AArch64FrameLowering.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
7070
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
7171
int64_t ObjectOffset, bool isFixed,
7272
bool isSVE, Register &FrameReg,
73-
bool PreferFP, bool ForSimm) const;
73+
bool PreferFP, bool ForSimm,
74+
int64_t FI) const;
7475
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
7576
MachineBasicBlock::iterator MI,
7677
ArrayRef<CalleeSavedInfo> CSI,
@@ -155,7 +156,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
155156
/// Returns the size of the entire ZPR stackframe (calleesaves + spills).
156157
StackOffset getZPRStackSize(const MachineFunction &MF) const;
157158

158-
/// Returns the size of the entire PPR stackframe (calleesaves + spills).
159+
/// Returns the size of the entire PPR stackframe (calleesaves + spills +
160+
/// hazard padding).
159161
StackOffset getPPRStackSize(const MachineFunction &MF) const;
160162

161163
/// Returns the size of the entire SVE stackframe (PPRs + ZPRs).

llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
137137
uint64_t StackSizeZPR = 0;
138138
uint64_t StackSizePPR = 0;
139139

140+
/// Are SVE objects (vectors and predicates) split into separate regions on
141+
/// the stack.
142+
bool SplitSVEObjects = false;
143+
140144
/// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid.
141145
bool HasCalculatedStackSizeSVE = false;
142146

@@ -336,7 +340,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
336340

337341
/// Returns the current value of the StackRealigned flag.
bool isStackRealigned() const { return StackRealigned; }
338342
/// Sets the StackRealigned flag to \p s.
void setStackRealigned(bool s) { StackRealigned = s; }
339-
340343
bool hasCalleeSaveStackFreeSpace() const {
341344
return CalleeSaveStackHasFreeSpace;
342345
}
@@ -481,7 +484,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
481484
StackHazardCSRSlotIndex = Index;
482485
}
483486

484-
bool hasSplitSVEObjects() const { return false; }
487+
/// Returns the SplitSVEObjects flag, i.e. whether SVE objects (vectors and
/// predicates) are split into separate regions on the stack.
bool hasSplitSVEObjects() const { return SplitSVEObjects; }
488+
/// Sets the SplitSVEObjects flag to \p s.
void setSplitSVEObjects(bool s) { SplitSVEObjects = s; }
485489

486490
/// Returns a copy of the stored SMEFnAttrs value.
SMEAttrs getSMEFnAttrs() const { return SMEFnAttrs; }
487491

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,36 @@ void AArch64PrologueEmitter::emitPrologue() {
754754
emitCalleeSavedSVELocations(AfterSVESavesI);
755755

756756
if (AFI->hasSplitSVEObjects()) {
757-
reportFatalInternalError("not implemented yet");
757+
assert(!FPAfterSVECalleeSaves &&
758+
"Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
759+
assert(!AFL.canUseRedZone(MF) &&
760+
"Cannot use redzone with aarch64-split-sve-objects");
761+
// TODO: Handle HasWinCFI/NeedsWinCFI?
762+
assert(!NeedsWinCFI &&
763+
"WinCFI with aarch64-split-sve-objects is not supported");
764+
765+
// Split ZPR and PPR allocation.
766+
// Allocate PPR callee saves
767+
allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
768+
EmitAsyncCFI && !HasFP, CFAOffset,
769+
MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
770+
ZPRLocalsSize || PPRLocalsSize);
771+
CFAOffset += PPRCalleeSavesSize;
772+
773+
// Allocate PPR locals + ZPR callee saves
774+
assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
775+
"Expected ZPR callee saves after PPR locals");
776+
allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
777+
PPRLocalsSize + ZPRCalleeSavesSize,
778+
EmitAsyncCFI && !HasFP, CFAOffset,
779+
MFI.hasVarSizedObjects() || ZPRLocalsSize);
780+
CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
781+
782+
// Allocate ZPR locals
783+
allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
784+
ZPRLocalsSize + StackOffset::getFixed(NumBytes),
785+
EmitAsyncCFI && !HasFP, CFAOffset,
786+
MFI.hasVarSizedObjects());
758787
} else {
759788
// Allocate space for the callee saves (if any).
760789
StackOffset LocalsSize =
@@ -1221,8 +1250,10 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
12211250
AFL.getOffsetOfLocalArea();
12221251
}
12231252

1253+
StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
12241254
for (const auto &Info : CSI) {
1225-
if (!MFI.isScalableStackID(Info.getFrameIdx()))
1255+
int FI = Info.getFrameIdx();
1256+
if (!MFI.isScalableStackID(FI))
12261257
continue;
12271258

12281259
// Not all unwinders may know about SVE registers, so assume the lowest
@@ -1233,9 +1264,13 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
12331264
continue;
12341265

12351266
StackOffset Offset =
1236-
StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
1267+
StackOffset::getScalable(MFI.getObjectOffset(FI)) -
12371268
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
12381269

1270+
if (AFI->hasSplitSVEObjects() &&
1271+
MFI.getStackID(FI) == TargetStackID::ScalableVector)
1272+
Offset -= PPRStackSize;
1273+
12391274
CFIBuilder.insertCFIInst(
12401275
createCFAOffset(RegInfo, Reg, Offset, IncomingVGOffsetFromDefCFA));
12411276
}
@@ -1512,7 +1547,73 @@ void AArch64EpilogueEmitter::emitEpilogue() {
15121547
emitCalleeSavedSVERestores(RestoreEnd);
15131548
}
15141549
} else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
1515-
reportFatalInternalError("not implemented yet");
1550+
assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
1551+
"TODO: Support stack realigment / variable-sized objects");
1552+
// SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR
1553+
// areas.
1554+
auto ZPRCalleeSavedSize =
1555+
StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
1556+
auto PPRCalleeSavedSize =
1557+
StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
1558+
StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
1559+
StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
1560+
1561+
MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
1562+
PPRRestoreEnd = FirstGPRRestoreI;
1563+
if (PPRCalleeSavedSize) {
1564+
PPRRestoreBegin = std::prev(PPRRestoreEnd);
1565+
while (PPRRestoreBegin != MBB.begin() &&
1566+
isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
1567+
--PPRRestoreBegin;
1568+
}
1569+
1570+
MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
1571+
ZPRRestoreEnd = PPRRestoreBegin;
1572+
if (ZPRCalleeSavedSize) {
1573+
ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
1574+
while (ZPRRestoreBegin != MBB.begin() &&
1575+
isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
1576+
--ZPRRestoreBegin;
1577+
}
1578+
1579+
auto CFAOffset =
1580+
SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
1581+
if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
1582+
// Deallocate the non-SVE locals first before we can deallocate (and
1583+
// restore callee saves) from the SVE area.
1584+
auto NonSVELocals = StackOffset::getFixed(NumBytes);
1585+
emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1586+
NonSVELocals, TII, MachineInstr::FrameDestroy, false,
1587+
false, nullptr, EmitCFI && !HasFP, CFAOffset);
1588+
NumBytes = 0;
1589+
CFAOffset -= NonSVELocals;
1590+
}
1591+
1592+
if (ZPRLocalsSize) {
1593+
emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1594+
ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
1595+
false, nullptr, EmitCFI && !HasFP, CFAOffset);
1596+
CFAOffset -= ZPRLocalsSize;
1597+
}
1598+
1599+
if (PPRLocalsSize || ZPRCalleeSavedSize) {
1600+
assert(PPRRestoreBegin == ZPRRestoreEnd &&
1601+
"Expected PPR restores after ZPR");
1602+
emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
1603+
PPRLocalsSize + ZPRCalleeSavedSize, TII,
1604+
MachineInstr::FrameDestroy, false, false, nullptr,
1605+
EmitCFI && !HasFP, CFAOffset);
1606+
CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
1607+
}
1608+
if (PPRCalleeSavedSize) {
1609+
emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
1610+
PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
1611+
false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
1612+
}
1613+
1614+
// We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
1615+
if (EmitCFI)
1616+
emitCalleeSavedSVERestores(ZPRRestoreEnd);
15161617
}
15171618

15181619
if (!HasFP) {

0 commit comments

Comments
 (0)