diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f1e860fac732..0589b14949bf4 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -52,6 +52,8 @@ // | async context if needed | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) +// | | +// |-----------------------------------| // | | // | callee-saved fp/simd/SVE regs | // | | @@ -64,9 +66,11 @@ // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) // |-----------------------------------| -// | | // | local variables of fixed size | // | including spill slots | +// | | +// | | +// | | // |-----------------------------------| <- bp(not defined by ABI, // |.variable-sized.local.variables....| LLVM chooses X19) // |.(VLAs)............................| (size of this area is unknown at @@ -117,6 +121,20 @@ // // FIXME: also explain the redzone concept. // +// About stack hazards: Under some SME contexts, a coprocessor with its own +// separate cache can be used for FP operations. This can create hazards if the CPU +// and the SME unit try to access the same area of memory, including if the +// access is to an area of the stack. To try to alleviate this, we attempt to +// introduce extra padding into the stack frame between FP and GPR accesses, +// controlled by the StackHazardSize option. Without changing the layout of the +// stack frame in the diagram above, a stack object of size StackHazardSize is +// added between GPR and FPR CSRs. Another is added to the stack objects +// section, and stack objects are sorted so that FPR > Hazard padding slot > +// GPRs (where possible). Unfortunately some things are not handled well (VLA +// area, arguments on the stack, objects with both GPR and FPR accesses), but if +// those are controlled by the user then the entire stack frame becomes GPR at +// the start/end with FPR in the middle, surrounded by Hazard padding. +// // An example of the prologue: // // .globl __foo // @@ -196,6 +214,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog( cl::desc("Emit homogeneous prologue and epilogue for the size " "optimization (default = off)")); +// Stack hazard padding size. 0 = disabled. +static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size", + cl::init(0), cl::Hidden); +// Whether to insert padding into non-streaming functions (for testing). +static cl::opt<bool> + StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", + cl::init(false), cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// Returns how much of the incoming argument stack area (in bytes) we should @@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // update in so create a normal arithmetic instruction instead. if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { + // If we are destroying the frame, make sure we add the increment after the + // last frame operation.
+ if (FrameFlag == MachineInstr::FrameDestroy) + ++MBBI; emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, false, false, nullptr, EmitCFI, @@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs( } int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); + Register LastReg = 0; // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -2922,8 +2954,15 @@ else llvm_unreachable("Unsupported register class."); + // Add the stack hazard size as we transition from GPR->FPR CSRs. + if (AFI->hasStackHazardSlotIndex() && + (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && + AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) + ByteOffset += StackFillDir * StackHazardSize; + LastReg = RPI.Reg1; + // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count) { + if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -3034,7 +3073,8 @@ Offset += 8; RPI.Offset = Offset / Scale; - assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || + assert((!RPI.isPaired() || + (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); @@ -3455,6 +3495,80 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } +// Return the FrameID for a Load/Store instruction by looking at the MMO. +static std::optional<int> getLdStFrameID(const MachineInstr &MI, + const MachineFrameInfo &MFI) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + return std::nullopt; + + MachineMemOperand *MMO = *MI.memoperands_begin(); + auto *PSV = + dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue()); + if (PSV) + return std::optional<int>(PSV->getFrameIndex()); + + if (MMO->getValue()) { + if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) { + for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); + FI++) + if (MFI.getObjectAllocation(FI) == Al) + return FI; + } + } + + return std::nullopt; +} + +// Check if a Hazard slot is needed for the current function, and if so create +// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, +// which can be used to determine if any hazard padding is needed. +void AArch64FrameLowering::determineStackHazardSlot( + MachineFunction &MF, BitVector &SavedRegs) const { + if (StackHazardSize == 0 || StackHazardSize % 16 != 0 || + MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex()) + return; + + // Stack hazards are only needed in streaming functions. + SMEAttrs Attrs(MF.getFunction()); + if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody()) + return; + + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Add a hazard slot if there are any CSR FPR registers, or any fp-only + // stack objects.
+ bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { + return AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR128RegClass.contains(Reg) || + AArch64::ZPRRegClass.contains(Reg) || + AArch64::PPRRegClass.contains(Reg); + }); + bool HasFPRStackObjects = false; + if (!HasFPRCSRs) { + std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd()); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + std::optional<int> FI = getLdStFrameID(MI, MFI); + if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { + if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI)) + FrameObjects[*FI] |= 2; + else + FrameObjects[*FI] |= 1; + } + } + } + HasFPRStackObjects = + any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + } + + if (HasFPRCSRs || HasFPRStackObjects) { + int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false); + LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size " + << StackHazardSize << "\n"); + MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID); + } +} + void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -3595,6 +3709,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, CSStackSize += 8; } + // Determine if a Hazard slot should be used, and increase the CSStackSize by + // StackHazardSize if so. + determineStackHazardSlot(MF, SavedRegs); + if (AFI->hasStackHazardSlotIndex()) + CSStackSize += StackHazardSize; + // Save number of saved regs, so we can easily update CSStackSize later. unsigned NumSavedRegs = SavedRegs.count(); @@ -3761,10 +3881,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end()); } + Register LastReg = 0; + int HazardSlotIndex = std::numeric_limits<int>::max(); for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // Create a hazard slot as we switch between GPR and FPR CSRs. + if (AFI->hasStackHazardSlotIndex() && + (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && + AArch64InstrInfo::isFpOrNEON(Reg)) { + assert(HazardSlotIndex == std::numeric_limits<int>::max() && + "Unexpected register order for hazard slot"); + HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); + LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex + << "\n"); + AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); + if ((unsigned)HazardSlotIndex < MinCSFrameIndex) + MinCSFrameIndex = HazardSlotIndex; + if ((unsigned)HazardSlotIndex > MaxCSFrameIndex) + MaxCSFrameIndex = HazardSlotIndex; + } + unsigned Size = RegInfo->getSpillSize(*RC); Align Alignment(RegInfo->getSpillAlign(*RC)); int FrameIdx = MFI.CreateStackObject(Size, Alignment, true); @@ -3785,7 +3923,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } + LastReg = Reg; + } + + // Add a hazard slot in the case where no FPR CSRs are present.
+ if (AFI->hasStackHazardSlotIndex() && + HazardSlotIndex == std::numeric_limits<int>::max()) { + HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); + LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex + << "\n"); + AFI->setStackHazardCSRSlotIndex(HazardSlotIndex); + if ((unsigned)HazardSlotIndex < MinCSFrameIndex) + MinCSFrameIndex = HazardSlotIndex; + if ((unsigned)HazardSlotIndex > MaxCSFrameIndex) + MaxCSFrameIndex = HazardSlotIndex; } + return true; } @@ -3798,6 +3951,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging( // function doesn't use a FP. if (AFI->hasStreamingModeChanges() && !hasFP(MF)) return false; + // Don't allow register scavenging with hazard slots, in case it moves objects + // into the wrong place. + if (AFI->hasStackHazardSlotIndex()) + return false; return AFI->hasCalleeSaveStackFreeSpace(); } @@ -4492,6 +4649,11 @@ struct FrameObject { // This object's group (which always contains the object with // ObjectFirst==true) should be placed first. bool GroupFirst = false; + + // Used to distinguish between FP and GPR accesses. The values are decided so + // that they sort FPR < Hazard < GPR and they can be or'd together. + unsigned Accesses = 0; + enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 }; }; class GroupBuilder { @@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { // at the end. This also allows us to stop walking when we hit the // first invalid item after it's all sorted. // - // The "first" object goes first (closest to SP), followed by the members of - // the "first" group. + // If we want to include a stack hazard region, order FPR accesses < the + // hazard object < GPR accesses in order to create a separation between the + // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR. + // + // Otherwise the "first" object goes first (closest to SP), followed by the + // members of the "first" group. // // The rest are sorted by the group index to keep the groups together. // Higher numbered groups are more likely to be around longer (i.e. untagged @@ -4537,10 +4703,10 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { // // If all else equal, sort by the object index to keep the objects in the // original order. - return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex, - A.ObjectIndex) < - std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex, - B.ObjectIndex); + return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst, + A.GroupIndex, A.ObjectIndex) < + std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst, + B.GroupIndex, B.ObjectIndex); } } // namespace @@ -4549,6 +4715,7 @@ void AArch64FrameLowering::orderFrameObjects( if (!OrderFrameObjects || ObjectsToAllocate.empty()) return; + const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); for (auto &Obj : ObjectsToAllocate) { FrameObjects[Obj].ObjectIndex = Obj; } - // Identify stack slots that are tagged at the same time. + // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at + // the same time.
GroupBuilder GB(FrameObjects); for (auto &MBB : MF) { for (auto &MI : MBB) { if (MI.isDebugInstr()) continue; + + if (AFI.hasStackHazardSlotIndex()) { + std::optional FI = getLdStFrameID(MI, MFI); + if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { + if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI)) + FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; + else + FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; + } + } + int OpIndex; switch (MI.getOpcode()) { case AArch64::STGloop: @@ -4600,11 +4779,20 @@ void AArch64FrameLowering::orderFrameObjects( GB.EndCurrentGroup(); } + if (AFI.hasStackHazardSlotIndex()) { + FrameObjects[AFI.getStackHazardSlotIndex()].Accesses = + FrameObject::AccessHazard; + // If a stack object is unknown or both GPR and FPR, sort it into GPR. + for (auto &Obj : FrameObjects) + if (!Obj.Accesses || + Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR)) + Obj.Accesses = FrameObject::AccessGPR; + } + // If the function's tagged base pointer is pinned to a stack slot, we want to // put that slot first when possible. This will likely place it at SP + 0, // and save one instruction when generating the base pointer because IRG does // not allow an immediate offset. - const AArch64FunctionInfo &AFI = *MF.getInfo(); std::optional TBPI = AFI.getTaggedBasePointerIndex(); if (TBPI) { FrameObjects[*TBPI].ObjectFirst = true; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 941af03a78b73..da315850d6362 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -155,6 +155,10 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, bool FollowupAllocs) const; + /// Make a determination whether a Hazard slot is used and create it if + /// needed. + void determineStackHazardSlot(MachineFunction &MF, + BitVector &SavedRegs) const; /// Emit target zero call-used regs. void emitZeroCallUsedRegs(BitVector RegsToZero, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 001521d1101eb..72f110cebbdc8 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -109,6 +109,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize = 0; + /// The stack slots used to add space between FPR and GPR accesses when using + /// hazard padding. StackHazardCSRSlotIndex is added between GPR and FPR CSRs. + /// StackHazardSlotIndex is added between (sorted) stack objects. + int StackHazardSlotIndex = std::numeric_limits::max(); + int StackHazardCSRSlotIndex = std::numeric_limits::max(); + /// True if this function has a subset of CSRs that is handled explicitly via /// copies. 
bool IsSplitCSR = false; @@ -346,6 +352,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { MaxOffset = std::max(Offset + ObjSize, MaxOffset); } + if (StackHazardCSRSlotIndex != std::numeric_limits::max()) { + int64_t Offset = MFI.getObjectOffset(StackHazardCSRSlotIndex); + int64_t ObjSize = MFI.getObjectSize(StackHazardCSRSlotIndex); + MinOffset = std::min(Offset, MinOffset); + MaxOffset = std::max(Offset + ObjSize, MaxOffset); + } + unsigned Size = alignTo(MaxOffset - MinOffset, 16); assert((!HasCalleeSavedStackSize || getCalleeSavedStackSize() == Size) && "Invalid size calculated for callee saves"); @@ -403,6 +416,20 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + bool hasStackHazardSlotIndex() const { + return StackHazardSlotIndex != std::numeric_limits::max(); + } + int getStackHazardSlotIndex() const { return StackHazardSlotIndex; } + void setStackHazardSlotIndex(int Index) { + assert(StackHazardSlotIndex == std::numeric_limits::max()); + StackHazardSlotIndex = Index; + } + int getStackHazardCSRSlotIndex() const { return StackHazardCSRSlotIndex; } + void setStackHazardCSRSlotIndex(int Index) { + assert(StackHazardCSRSlotIndex == std::numeric_limits::max()); + StackHazardCSRSlotIndex = Index; + } + unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll new file mode 100644 index 0000000000000..50a2e41f45756 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -0,0 +1,3051 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024 + +define i32 @basic(i32 noundef %num) { +; CHECK-LABEL: basic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +; Non-streaming functions don't need hazards +define i32 @csr_d8_notsc(i32 noundef %num) { +; CHECK-LABEL: csr_d8_notsc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +; Very simple - doesn't require hazards +define i32 @basic_sc(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: basic_sc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +; No fpr accesses/csrs - doesn't require hazards +define i32 @nocsr_alloci64(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: nocsr_alloci64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str x8, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %a = alloca i64 + store i64 %d, ptr %a + ret i32 0 +} + +; No fpr accesses/csrs - doesn't require hazards +define i32 @csr_x20(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: csr_x20: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret i32 0 +} + +; CSR of d8. Make sure there is a gap between FPR and GPR +define i32 @csr_d8(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2064 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +; Stack fpr objects. 
+define i32 @nocsr_allocd(double %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: nocsr_allocd: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: add sp, sp, #16 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: nocsr_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_def_cfa_offset 80 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: str d0, [sp, #72] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: nocsr_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + store double %d, ptr %a + ret i32 0 +} + +define i32 @csr_d8d9(i32 noundef %num) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8d9: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -8 +; CHECK0-NEXT: .cfi_offset b9, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8d9: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -72 +; CHECK64-NEXT: .cfi_offset b9, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8d9: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: stp d9, d8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_offset b8, -1048 +; CHECK1024-NEXT: .cfi_offset b9, -1056 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldp d9, d8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8},~{d9}"() #1 + ret i32 0 +} + +define i32 @csr_d8_allocd(double %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_allocd: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d8, d0, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #160 +; CHECK64-NEXT: stp d0, d8, [sp, #72] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 160 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #160 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +define i32 @csr_d8_alloci64(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_alloci64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: str x8, [sp, #8] +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_alloci64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #160 +; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 160 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: add sp, sp, #160 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_alloci64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +; Check the frame pointer is in the right place +define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK0-LABEL: 
csr_d8_allocd_framepointer: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK0-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: add x29, sp, #16 +; CHECK0-NEXT: .cfi_def_cfa w29, 16 +; CHECK0-NEXT: .cfi_offset w30, -8 +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_offset b8, -32 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocd_framepointer: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #176 +; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #152] // 16-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #80 +; CHECK64-NEXT: .cfi_def_cfa w29, 96 +; CHECK64-NEXT: .cfi_offset w30, -16 +; CHECK64-NEXT: .cfi_offset w29, -24 +; CHECK64-NEXT: .cfi_offset b8, -96 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: stur d0, [x29, #-8] +; CHECK64-NEXT: ldr x29, [sp, #152] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: add sp, sp, #176 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocd_framepointer: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: mov x29, sp +; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_def_cfa w29, 1056 +; CHECK1024-NEXT: .cfi_offset w30, -16 +; CHECK1024-NEXT: .cfi_offset w29, -24 +; CHECK1024-NEXT: .cfi_offset b8, -1056 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: stur d0, [x29, #-8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +; sve stack objects should live with other fpr registers +define i32 @csr_d8_allocnxv4i32(i64 %d) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_allocnxv4i32: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: str x29, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -8 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov z0.s, #0 // =0x0 +; CHECK0-NEXT: ptrue p0.s +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: st1w { z0.s }, p0, [sp] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_allocnxv4i32: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: str d8, [sp, #-80]! 
// 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #72] // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -8 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: mov z0.s, #0 // =0x0 +; CHECK64-NEXT: ptrue p0.s +; CHECK64-NEXT: add x8, sp, #64 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: st1w { z0.s }, p0, [x8] +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr x29, [sp, #72] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp], #80 // 8-byte Folded Reload +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_allocnxv4i32: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NEXT: ptrue p0.s +; CHECK1024-NEXT: add x8, sp, #1024 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: st1w { z0.s }, p0, [x8] +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + ret i32 0 +} + +define i32 @csr_x18_25_d8_15_allocdi64(i64 %d, double %e) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #144 +; CHECK0-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: str x25, [sp, #80] // 8-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 144 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -64 +; CHECK0-NEXT: .cfi_offset b8, -72 +; CHECK0-NEXT: .cfi_offset b9, -80 +; CHECK0-NEXT: .cfi_offset b10, -88 +; CHECK0-NEXT: .cfi_offset b11, -96 +; CHECK0-NEXT: .cfi_offset b12, -104 +; CHECK0-NEXT: .cfi_offset b13, -112 +; CHECK0-NEXT: .cfi_offset b14, -120 +; CHECK0-NEXT: .cfi_offset b15, -128 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x25, [sp, #80] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload +; CHECK0-NEXT: 
mov w0, wzr +; CHECK0-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: str x8, [sp, #88] +; CHECK0-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: str d0, [sp, #8] +; CHECK0-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #144 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #288 +; CHECK64-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #224] // 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #240] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 288 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: ldp x20, x19, [sp, #272] // 16-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: ldp x22, x21, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: ldp x24, x23, [sp, #240] // 16-byte Folded Reload +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: ldp x29, x25, [sp, #224] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #288 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1152 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1144] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2208 +; CHECK1024-NEXT: 
.cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1144] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x23, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x24, [sp, #1104] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x25, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1152 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + %b = alloca double + tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"() + tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() + store i64 %d, ptr %a + store double %e, ptr %b + ret i32 0 +} + +define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarch64_pstate_sm_body" "target-features"="+sme" { +; CHECK0-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 176 +; CHECK0-NEXT: rdsvl x9, #1 +; CHECK0-NEXT: stp d15, d14, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: lsr x9, x9, #3 +; CHECK0-NEXT: stp d13, d12, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #80] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: str x25, [sp, #112] // 8-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #128] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -64 +; CHECK0-NEXT: .cfi_offset b8, -72 +; CHECK0-NEXT: .cfi_offset b9, -80 +; CHECK0-NEXT: .cfi_offset b10, -88 +; CHECK0-NEXT: .cfi_offset b11, -96 +; CHECK0-NEXT: .cfi_offset b12, -104 +; CHECK0-NEXT: .cfi_offset b13, -112 +; CHECK0-NEXT: 
.cfi_offset b14, -120 +; CHECK0-NEXT: .cfi_offset b15, -128 +; CHECK0-NEXT: .cfi_offset vg, -136 +; CHECK0-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: str x0, [sp, #24] +; CHECK0-NEXT: str d0, [sp, #16] +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x25, [sp, #112] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d9, d8, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d11, d10, [sp, #80] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w21 +; CHECK0-NEXT: .cfi_restore w22 +; CHECK0-NEXT: .cfi_restore w23 +; CHECK0-NEXT: .cfi_restore w24 +; CHECK0-NEXT: .cfi_restore w25 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #304 +; CHECK64-NEXT: .cfi_def_cfa_offset 304 +; CHECK64-NEXT: rdsvl x9, #1 +; CHECK64-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: lsr x9, x9, #3 +; CHECK64-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #96] // 8-byte Folded Spill +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: str x9, [sp, #104] // 8-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #240] // 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: .cfi_offset vg, -200 +; CHECK64-NEXT: str d0, [sp, #80] // 8-byte Folded Spill +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: ldr d0, [sp, #80] // 8-byte Folded Reload +; CHECK64-NEXT: str x0, [sp, #8] +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK64-NEXT: mov w0, wzr +; 
CHECK64-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp, #240] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #304 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w21 +; CHECK64-NEXT: .cfi_restore w22 +; CHECK64-NEXT: .cfi_restore w23 +; CHECK64-NEXT: .cfi_restore w24 +; CHECK64-NEXT: .cfi_restore w25 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: rdsvl x9, #1 +; CHECK1024-NEXT: lsr x9, x9, #3 +; CHECK1024-NEXT: sub sp, sp, #1168 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 +; CHECK1024-NEXT: str x9, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK1024-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1144] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1152] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1160] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: .cfi_offset vg, -1160 +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2224 +; CHECK1024-NEXT: str d0, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: ldr d0, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: str x0, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1168 +; CHECK1024-NEXT: ldp d9, d8, [sp, #64] 
// 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1160] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1152] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1144] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x23, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x24, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x25, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1104] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1168 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w21 +; CHECK1024-NEXT: .cfi_restore w22 +; CHECK1024-NEXT: .cfi_restore w23 +; CHECK1024-NEXT: .cfi_restore w24 +; CHECK1024-NEXT: .cfi_restore w25 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: .cfi_restore b8 +; CHECK1024-NEXT: .cfi_restore b9 +; CHECK1024-NEXT: .cfi_restore b10 +; CHECK1024-NEXT: .cfi_restore b11 +; CHECK1024-NEXT: .cfi_restore b12 +; CHECK1024-NEXT: .cfi_restore b13 +; CHECK1024-NEXT: .cfi_restore b14 +; CHECK1024-NEXT: .cfi_restore b15 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + %b = alloca double + tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"() + tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"() + store i64 %d, ptr %a + store double %e, ptr %b + ret i32 0 +} + +; We don't currently handle fpr stack arguments very well (they are hopefully relatively rare). +define float @nocsr_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: nocsr_stackargs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [sp] +; CHECK-NEXT: ret +entry: + ret float %i +} + +define float @csr_x20_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: csr_x20_stackargs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: ldr s0, [sp, #16] +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +define float @csr_d8_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: csr_d8_stackargs: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: ldr s0, [sp, #16] +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: csr_d8_stackargs: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: ldr s0, [sp, #144] +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: csr_d8_stackargs: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2064 +; CHECK1024-NEXT: .cfi_offset w29, -8 +; CHECK1024-NEXT: .cfi_offset b8, -1040 +; CHECK1024-NEXT: ldr s0, [sp, #2064] +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret float %i +} + +; SVE calling conventions +define i32 @svecc_basic(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_basic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + ret i32 0 +} + +define i32 @svecc_csr_x20(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_csr_x20: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x20, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x20, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8d9(i32 noundef %num, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8d9: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-2 +; CHECK0-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #2 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8d9: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 +; CHECK64-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 16 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 80 - 16 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #2 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8d9: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-2 +; CHECK1024-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1040 - 16 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #2 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{d8},~{d9}"() #1 + ret i32 0 +} + +define i32 @svecc_csr_d8_allocd(double %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_allocd: +; 
CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: addvl x8, sp, #1 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: str d0, [x8, #8] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8_allocd: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str d0, [sp, #72] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8_allocd: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str d0, [sp, #1032] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store double %d, ptr %a + ret i32 0 +} + +define i32 @svecc_csr_d8_alloci64(i64 %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_alloci64: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: addvl x9, sp, #1 +; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: str x8, [x9, #8] +; CHECK0-NEXT: addvl sp, sp, #1 +; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_d8_alloci64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 8 * VG +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #1 +; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_csr_d8_alloci64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #1 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, %vs) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK0-NEXT: .cfi_offset w29, -16
+; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK0-NEXT: mov z0.s, #0 // =0x0
+; CHECK0-NEXT: ptrue p0.s
+; CHECK0-NEXT: mov w0, wzr
+; CHECK0-NEXT: //APP
+; CHECK0-NEXT: //NO_APP
+; CHECK0-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK0-NEXT: addvl sp, sp, #1
+; CHECK0-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK0-NEXT: addvl sp, sp, #1
+; CHECK0-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: svecc_csr_d8_allocnxv4i32:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: sub sp, sp, #80
+; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x01, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 144 + 16 * VG
+; CHECK64-NEXT: .cfi_offset w29, -16
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xb0, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 80 - 8 * VG
+; CHECK64-NEXT: mov z0.s, #0 // =0x0
+; CHECK64-NEXT: ptrue p0.s
+; CHECK64-NEXT: add x8, sp, #64
+; CHECK64-NEXT: mov w0, wzr
+; CHECK64-NEXT: //APP
+; CHECK64-NEXT: //NO_APP
+; CHECK64-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK64-NEXT: add sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #1
+; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, sp, #1
+; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ret
+;
+; CHECK1024-LABEL: svecc_csr_d8_allocnxv4i32:
+; CHECK1024: // %bb.0: // %entry
+; CHECK1024-NEXT: sub sp, sp, #1040
+; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK1024-NEXT: sub sp, sp, #1024
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK1024-NEXT: .cfi_offset w29, -16
+; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 8 * VG
+; CHECK1024-NEXT: mov z0.s, #0 // =0x0
+; CHECK1024-NEXT: ptrue p0.s
+; CHECK1024-NEXT: add x8, sp, #1024
+; CHECK1024-NEXT: mov w0, wzr
+; CHECK1024-NEXT: //APP
+; CHECK1024-NEXT: //NO_APP
+; CHECK1024-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK1024-NEXT: add sp, sp, #1024
+; CHECK1024-NEXT: addvl sp, sp, #1
+; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #1
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1040
+; CHECK1024-NEXT: ret
+entry:
+ %a = alloca <vscale x 4 x i32>
+ tail call void asm sideeffect "", "~{d8}"() #1
+ store <vscale x 4 x i32> zeroinitializer, ptr %a
+ ret i32 0
+}
+
+define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, %vs) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_csr_x18_25_d8_15_allocdi64:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x25, [sp, #-64]!
// 16-byte Folded Spill +; CHECK0-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: addvl sp, sp, #-8 +; CHECK0-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 64 * VG +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w21, -24 +; CHECK0-NEXT: .cfi_offset w22, -32 +; CHECK0-NEXT: .cfi_offset w23, -40 +; CHECK0-NEXT: .cfi_offset w24, -48 +; CHECK0-NEXT: .cfi_offset w25, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: str x8, [sp, #8] +; CHECK0-NEXT: str d0, [sp], #16 +; CHECK0-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #8 +; CHECK0-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: stp x29, x25, [sp, #64] // 16-byte Folded Spill +; 
CHECK64-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-8 +; CHECK64-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #96 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 224 + 64 * VG +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w21, -24 +; CHECK64-NEXT: .cfi_offset w22, -32 +; CHECK64-NEXT: .cfi_offset w23, -40 +; CHECK64-NEXT: .cfi_offset w24, -48 +; CHECK64-NEXT: .cfi_offset w25, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: str x8, [sp, #8] +; CHECK64-NEXT: str d0, [sp, #88] +; CHECK64-NEXT: add sp, sp, #96 +; CHECK64-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #8 +; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: 
svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NEXT: addvl sp, sp, #-8 +; CHECK1024-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w21, -24 +; CHECK1024-NEXT: .cfi_offset w22, -32 +; CHECK1024-NEXT: .cfi_offset w23, -40 +; CHECK1024-NEXT: .cfi_offset w24, -48 +; CHECK1024-NEXT: .cfi_offset w25, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: str x8, [sp, #8] +; CHECK1024-NEXT: str d0, [sp, #1048] +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: 
ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #8
+; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1088
+; CHECK1024-NEXT: ret
+entry:
+ %a = alloca i64
+ %b = alloca double
+ tail call void asm sideeffect "", "~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25}"()
+ tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15}"()
+ store i64 %d, ptr %a
+ store double %e, ptr %b
+ ret i32 0
+}
+
+
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sve_signature_pred_2xv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: ret
+ ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: mov p5.b, p0.b
+; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: mov p4.b, p1.b
+; CHECK0-NEXT: mov p0.b, p2.b
+; CHECK0-NEXT: mov p1.b, p3.b
+; CHECK0-NEXT: mov p2.b, p5.b
+; CHECK0-NEXT: mov p3.b, p4.b
+; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: addvl sp, sp, #1
+; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK64: // %bb.0:
+; CHECK64-NEXT: sub sp, sp, #80
+; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: mov p4.b, p1.b
+; CHECK64-NEXT: mov p5.b, p0.b
+; CHECK64-NEXT: mov p0.b, p2.b
+; CHECK64-NEXT: mov p1.b, p3.b
+; CHECK64-NEXT: mov p2.b, p5.b
+; CHECK64-NEXT: mov p3.b, p4.b
+; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK64-NEXT: add sp, sp, #64
+; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: addvl sp, sp, #1
+; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ret
+;
+; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK1024: // %bb.0:
+; CHECK1024-NEXT: sub sp, sp, #1040
+; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT: sub sp, sp, #1024
+; CHECK1024-NEXT: mov p4.b, p1.b
+; CHECK1024-NEXT: mov p5.b, p0.b
+; CHECK1024-NEXT: mov p0.b, p2.b
+; CHECK1024-NEXT: mov p1.b, p3.b
+; CHECK1024-NEXT: mov p2.b, p5.b
+; CHECK1024-NEXT: mov p3.b, p4.b
+; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK1024-NEXT: add sp, sp, #1024
+; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT: addvl sp, sp, #1
+; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1040
+; CHECK1024-NEXT: ret
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+ ret [2 x <vscale x 4 x i1>] %res
+}
+
+define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: f128_libcall:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: sub sp, sp, #176
+; CHECK0-NEXT: .cfi_def_cfa_offset 176
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill
+; CHECK0-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x30, x9, [sp, #128] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w20, -16
+; CHECK0-NEXT: .cfi_offset w21, -24
+; CHECK0-NEXT: .cfi_offset w22, -32
+; CHECK0-NEXT: .cfi_offset w30, -48
+; CHECK0-NEXT: .cfi_offset b8, -56
+; CHECK0-NEXT: .cfi_offset b9, -64
+; CHECK0-NEXT: .cfi_offset b10, -72
+; CHECK0-NEXT: .cfi_offset b11, -80
+; CHECK0-NEXT: .cfi_offset b12, -88
+; CHECK0-NEXT: .cfi_offset b13, -96
+; CHECK0-NEXT: .cfi_offset b14, -104
+; CHECK0-NEXT: .cfi_offset b15, -112
+; CHECK0-NEXT: mov w19, w1
+; CHECK0-NEXT: mov w20, w0
+; CHECK0-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK0-NEXT: stp q2, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: and x21, x0, #0x1
+; CHECK0-NEXT: .cfi_offset vg, -40
+; CHECK0-NEXT: tbz w21, #0, .LBB27_2
+; CHECK0-NEXT: // %bb.1:
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB27_2:
+; CHECK0-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK0-NEXT: bl __lttf2
+; CHECK0-NEXT: tbz w21, #0, .LBB27_4
+; CHECK0-NEXT: // %bb.3:
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB27_4:
+; CHECK0-NEXT: cmp w0, #0
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: cset w21, lt
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: and x22, x0, #0x1
+; CHECK0-NEXT: .cfi_offset vg, -40
+; CHECK0-NEXT: tbz w22, #0, .LBB27_6
+; CHECK0-NEXT: // %bb.5:
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB27_6:
+; CHECK0-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK0-NEXT: bl __getf2
+; CHECK0-NEXT: tbz w22, #0, .LBB27_8
+; CHECK0-NEXT: // %bb.7:
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB27_8:
+; CHECK0-NEXT: cmp w0, #0
+; CHECK0-NEXT: cset w8, ge
+; CHECK0-NEXT: tst w8, w21
+; CHECK0-NEXT: csel w0, w20, w19, ne
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
+; CHECK0-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp d11, d10, [sp, #96] //
16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: add sp, sp, #176 +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w21 +; CHECK0-NEXT: .cfi_restore w22 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: f128_libcall: +; CHECK64: // %bb.0: +; CHECK64-NEXT: sub sp, sp, #320 +; CHECK64-NEXT: .cfi_def_cfa_offset 320 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp d15, d14, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp d13, d12, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #160] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #176] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #256] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x22, [sp, #272] // 16-byte Folded Spill +; CHECK64-NEXT: stp x21, x20, [sp, #288] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #304] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w20, -24 +; CHECK64-NEXT: .cfi_offset w21, -32 +; CHECK64-NEXT: .cfi_offset w22, -40 +; CHECK64-NEXT: .cfi_offset w30, -56 +; CHECK64-NEXT: .cfi_offset w29, -64 +; CHECK64-NEXT: .cfi_offset b8, -136 +; CHECK64-NEXT: .cfi_offset b9, -144 +; CHECK64-NEXT: .cfi_offset b10, -152 +; CHECK64-NEXT: .cfi_offset b11, -160 +; CHECK64-NEXT: .cfi_offset b12, -168 +; CHECK64-NEXT: .cfi_offset b13, -176 +; CHECK64-NEXT: .cfi_offset b14, -184 +; CHECK64-NEXT: .cfi_offset b15, -192 +; CHECK64-NEXT: mov w19, w1 +; CHECK64-NEXT: mov w20, w0 +; CHECK64-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill +; CHECK64-NEXT: stp q2, q3, [sp, #96] // 32-byte Folded Spill +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x21, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w21, #0, .LBB27_2 +; CHECK64-NEXT: // %bb.1: +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB27_2: +; CHECK64-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload +; CHECK64-NEXT: bl __lttf2 +; CHECK64-NEXT: tbz w21, #0, .LBB27_4 +; CHECK64-NEXT: // %bb.3: +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB27_4: +; CHECK64-NEXT: cmp w0, #0 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: cset w21, lt +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x22, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -48 +; CHECK64-NEXT: tbz w22, #0, .LBB27_6 +; CHECK64-NEXT: // %bb.5: +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB27_6: +; CHECK64-NEXT: ldp q0, q1, [sp, #96] // 32-byte Folded Reload +; CHECK64-NEXT: bl __getf2 +; CHECK64-NEXT: tbz w22, #0, .LBB27_8 +; CHECK64-NEXT: // %bb.7: +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB27_8: +; CHECK64-NEXT: cmp w0, #0 +; CHECK64-NEXT: cset w8, ge +; CHECK64-NEXT: tst w8, w21 +; CHECK64-NEXT: csel w0, w20, w19, ne +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: ldp x20, x19, [sp, #296] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #280] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #176] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload +; CHECK64-NEXT: ldp 
d13, d12, [sp, #144] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d15, d14, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #320 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w21 +; CHECK64-NEXT: .cfi_restore w22 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: f128_libcall: +; CHECK1024: // %bb.0: +; CHECK1024-NEXT: sub sp, sp, #1152 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1152 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x22, [sp, #1112] // 8-byte Folded Spill +; CHECK1024-NEXT: str x21, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x20, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1136] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -16 +; CHECK1024-NEXT: .cfi_offset w20, -24 +; CHECK1024-NEXT: .cfi_offset w21, -32 +; CHECK1024-NEXT: .cfi_offset w22, -40 +; CHECK1024-NEXT: .cfi_offset w30, -56 +; CHECK1024-NEXT: .cfi_offset w29, -64 +; CHECK1024-NEXT: .cfi_offset b8, -1096 +; CHECK1024-NEXT: .cfi_offset b9, -1104 +; CHECK1024-NEXT: .cfi_offset b10, -1112 +; CHECK1024-NEXT: .cfi_offset b11, -1120 +; CHECK1024-NEXT: .cfi_offset b12, -1128 +; CHECK1024-NEXT: .cfi_offset b13, -1136 +; CHECK1024-NEXT: .cfi_offset b14, -1144 +; CHECK1024-NEXT: .cfi_offset b15, -1152 +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2240 +; CHECK1024-NEXT: mov w19, w1 +; CHECK1024-NEXT: mov w20, w0 +; CHECK1024-NEXT: str q3, [sp, #1072] // 16-byte Folded Spill +; CHECK1024-NEXT: str q2, [sp, #1056] // 16-byte Folded Spill +; CHECK1024-NEXT: str q1, [sp, #1040] // 16-byte Folded Spill +; CHECK1024-NEXT: str q0, [sp, #1024] // 16-byte Folded Spill +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x21, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w21, #0, .LBB27_2 +; CHECK1024-NEXT: // %bb.1: +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB27_2: +; CHECK1024-NEXT: ldr q0, [sp, #1024] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr q1, [sp, #1040] // 16-byte Folded Reload +; CHECK1024-NEXT: bl __lttf2 +; CHECK1024-NEXT: tbz w21, #0, .LBB27_4 +; CHECK1024-NEXT: // %bb.3: +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB27_4: +; CHECK1024-NEXT: cmp w0, #0 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: cset w21, lt +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x22, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -48 +; CHECK1024-NEXT: tbz w22, #0, .LBB27_6 +; CHECK1024-NEXT: // %bb.5: +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB27_6: +; CHECK1024-NEXT: ldr q0, [sp, #1056] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr q1, [sp, #1072] // 16-byte Folded Reload +; CHECK1024-NEXT: bl __getf2 +; 
CHECK1024-NEXT: tbz w22, #0, .LBB27_8 +; CHECK1024-NEXT: // %bb.7: +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB27_8: +; CHECK1024-NEXT: cmp w0, #0 +; CHECK1024-NEXT: cset w8, ge +; CHECK1024-NEXT: tst w8, w21 +; CHECK1024-NEXT: csel w0, w20, w19, ne +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1152 +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1136] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x21, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x22, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1152 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w21 +; CHECK1024-NEXT: .cfi_restore w22 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: .cfi_restore b8 +; CHECK1024-NEXT: .cfi_restore b9 +; CHECK1024-NEXT: .cfi_restore b10 +; CHECK1024-NEXT: .cfi_restore b11 +; CHECK1024-NEXT: .cfi_restore b12 +; CHECK1024-NEXT: .cfi_restore b13 +; CHECK1024-NEXT: .cfi_restore b14 +; CHECK1024-NEXT: .cfi_restore b15 +; CHECK1024-NEXT: ret + %c0 = fcmp olt fp128 %v0, %v1 + %c1 = fcmp oge fp128 %v2, %v3 + %cr = and i1 %c1, %c0 + %sel = select i1 %cr, i32 %a, i32 %b + ret i32 %sel +} + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w27, -16 +; CHECK0-NEXT: .cfi_offset w28, -24 +; CHECK0-NEXT: .cfi_offset w30, -40 +; CHECK0-NEXT: .cfi_offset w29, -48 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: 
//NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x19, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: tbz w19, #0, .LBB28_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB28_2: // %entry +; CHECK0-NEXT: mov x0, x8 +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: mov w2, #37 // =0x25 +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB28_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB28_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #18 +; CHECK0-NEXT: .cfi_def_cfa wsp, 48 +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_call: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 112 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: 
stp x27, x19, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w27, -16 +; CHECK64-NEXT: .cfi_offset w28, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 112 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 112 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 112 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 112 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 112 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 112 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 112 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x01, 0x22, 0x11, 0x90, 0x01, 
0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 176 + 144 * VG +; CHECK64-NEXT: mov x8, x0 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x19, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: tbz w19, #0, .LBB28_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB28_2: // %entry +; CHECK64-NEXT: mov x0, x8 +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: mov w2, #37 // =0x25 +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w19, #0, .LBB28_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB28_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: add sp, sp, #64 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #18 +; CHECK64-NEXT: .cfi_def_cfa wsp, 112 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: ldp x27, x19, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore 
w28 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_call: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w27, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG +; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK1024-NEXT: mov x8, x0 +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x19, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB28_2: // %entry +; CHECK1024-NEXT: mov x0, x8 +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB28_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, 
[sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #18 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1072 +; CHECK1024-NEXT: .cfi_restore z8 +; CHECK1024-NEXT: .cfi_restore z9 +; CHECK1024-NEXT: .cfi_restore z10 +; CHECK1024-NEXT: .cfi_restore z11 +; CHECK1024-NEXT: .cfi_restore z12 +; CHECK1024-NEXT: .cfi_restore z13 +; CHECK1024-NEXT: .cfi_restore z14 +; CHECK1024-NEXT: .cfi_restore z15 +; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w27 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} + +define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_alloca_call: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w27, -16 +; CHECK0-NEXT: .cfi_offset w28, -24 +; CHECK0-NEXT: .cfi_offset w30, -40 +; CHECK0-NEXT: .cfi_offset w29, -48 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte 
Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG +; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG +; CHECK0-NEXT: sub sp, sp, #48 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 96 + 144 * VG +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: and x19, x0, #0x1 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: tbz w19, #0, .LBB29_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB29_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: mov w2, #37 // =0x25 +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB29_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB29_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: add sp, sp, #48 +; CHECK0-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 
2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: addvl sp, sp, #18 +; CHECK0-NEXT: .cfi_def_cfa wsp, 48 +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: svecc_alloca_call: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 112 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: stp x27, x19, [sp, #96] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w27, -16 +; CHECK64-NEXT: .cfi_offset w28, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded 
Spill +; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 112 - 8 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 112 - 16 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 112 - 24 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 112 - 32 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 112 - 40 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 112 - 48 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 112 - 56 * VG +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG +; CHECK64-NEXT: sub sp, sp, #112 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 224 + 144 * VG +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: and x19, x0, #0x1 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: tbz w19, #0, .LBB29_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB29_2: // %entry +; CHECK64-NEXT: mov x0, sp +; CHECK64-NEXT: mov w1, #45 // =0x2d +; CHECK64-NEXT: mov w2, #37 // =0x25 +; CHECK64-NEXT: bl memset +; CHECK64-NEXT: tbz w19, #0, .LBB29_4 +; CHECK64-NEXT: // %bb.3: // %entry +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .LBB29_4: // %entry +; CHECK64-NEXT: mov w0, #22647 // =0x5877 +; CHECK64-NEXT: movk w0, #59491, lsl #16 +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xf0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 112 + 144 * VG +; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; 
CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK64-NEXT: addvl sp, sp, #18 +; CHECK64-NEXT: .cfi_def_cfa wsp, 112 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 +; CHECK64-NEXT: ldp x27, x19, [sp, #96] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #112 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w27 +; CHECK64-NEXT: .cfi_restore w28 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: svecc_alloca_call: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w27, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: addvl sp, sp, #-18 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 
2-byte Folded Spill +; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG +; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG +; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2144 + 144 * VG +; CHECK1024-NEXT: //APP +; CHECK1024-NEXT: //NO_APP +; CHECK1024-NEXT: bl __arm_sme_state +; CHECK1024-NEXT: and x19, x0, #0x1 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: .LBB29_2: // %entry +; CHECK1024-NEXT: mov x0, sp +; CHECK1024-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NEXT: bl memset +; CHECK1024-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-NEXT: // %bb.3: // %entry +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .LBB29_4: // %entry +; CHECK1024-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z22, 
[sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NEXT: addvl sp, sp, #18 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1072 +; CHECK1024-NEXT: .cfi_restore z8 +; CHECK1024-NEXT: .cfi_restore z9 +; CHECK1024-NEXT: .cfi_restore z10 +; CHECK1024-NEXT: .cfi_restore z11 +; CHECK1024-NEXT: .cfi_restore z12 +; CHECK1024-NEXT: .cfi_restore z13 +; CHECK1024-NEXT: .cfi_restore z14 +; CHECK1024-NEXT: .cfi_restore z15 +; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w27 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %0 = alloca [37 x i8], align 16 + %call = call ptr @memset(ptr noundef nonnull %0, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +define void @call_with_doubles() "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: call_with_doubles: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK0-NEXT: str x30, [sp, #8] // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 16 +; CHECK0-NEXT: .cfi_offset w30, -8 +; CHECK0-NEXT: .cfi_offset b8, -16 +; CHECK0-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK0-NEXT: fmov d8, x8 +; CHECK0-NEXT: fmov d0, d8 +; CHECK0-NEXT: bl calld +; CHECK0-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +; CHECK0-NEXT: fmov d0, d8 +; CHECK0-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK0-NEXT: b calld +; +; CHECK64-LABEL: call_with_doubles: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #144 +; CHECK64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x30, [sp, #136] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 144 +; CHECK64-NEXT: .cfi_offset w30, -8 +; CHECK64-NEXT: .cfi_offset b8, -80 +; CHECK64-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK64-NEXT: fmov d8, x8 +; CHECK64-NEXT: fmov d0, d8 +; CHECK64-NEXT: bl calld +; CHECK64-NEXT: fmov d0, d8 +; CHECK64-NEXT: ldr x30, [sp, #136] // 8-byte Folded Reload +; CHECK64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #144 +; CHECK64-NEXT: b calld +; +; CHECK1024-LABEL: call_with_doubles: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2080 +; CHECK1024-NEXT: .cfi_offset w30, -16 +; CHECK1024-NEXT: .cfi_offset w29, -24 +; CHECK1024-NEXT: .cfi_offset b8, -1056 +; CHECK1024-NEXT: mov x8, #9221120237041090560 // =0x7ff8000000000000 +; CHECK1024-NEXT: fmov d8, x8 +; CHECK1024-NEXT: fmov d0, d8 +; CHECK1024-NEXT: bl calld +; CHECK1024-NEXT: fmov d0, d8 +; CHECK1024-NEXT: add sp, sp, #1024 +; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr d8, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1056 +; CHECK1024-NEXT: b calld +entry: + %call = tail call i32 @calld(double 0x7FF8000000000000) + %call.1 = tail call i32 @calld(double 0x7FF8000000000000) + ret void +} +declare i32 @calld(double) "aarch64_pstate_sm_compatible" + +; Check that stack objects are ordered fpr > hazard > gpr +define void @ordering_test(double %d, half %h, <4 x i32> %v) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: ordering_test: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #48 +; CHECK0-NEXT: .cfi_def_cfa_offset 48 +; CHECK0-NEXT: str wzr, [sp, #32] +; CHECK0-NEXT: str d0, [sp, #24] +; CHECK0-NEXT: str wzr, [sp, #44] +; CHECK0-NEXT: str h1, [sp, #22] +; CHECK0-NEXT: str wzr, [sp, #16] +; CHECK0-NEXT: str q2, [sp], #48 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: ordering_test: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #128 +; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp wzr, wzr, [sp, #12] +; CHECK64-NEXT: str d0, [sp, #120] +; CHECK64-NEXT: str wzr, [sp, #28] +; CHECK64-NEXT: str h1, [sp, #118] +; CHECK64-NEXT: str q2, [sp, #96] +; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: ordering_test: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2128 +; CHECK1024-NEXT: 
.cfi_offset w29, -16 +; CHECK1024-NEXT: stp wzr, wzr, [sp, #12] +; CHECK1024-NEXT: str d0, [sp, #1080] +; CHECK1024-NEXT: str wzr, [sp, #28] +; CHECK1024-NEXT: str h1, [sp, #1078] +; CHECK1024-NEXT: str q2, [sp, #1056] +; CHECK1024-NEXT: add sp, sp, #1088 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %i32 = alloca i32 + %i64 = alloca i64 + %f64 = alloca double + %f16 = alloca half + %i32b = alloca i32 + %v4i32 = alloca <4 x i32> + store i32 0, ptr %i64 + store double %d, ptr %f64 + store i32 0, ptr %i32 + store half %h, ptr %f16 + store i32 0, ptr %i32b + store <4 x i32> %v, ptr %v4i32 + ret void +} + + +define void @ordering_test_array(i64 %o, i64 %p, float %f, i32 %x) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: ordering_test_array: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: sub sp, sp, #272 +; CHECK0-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 272 +; CHECK0-NEXT: .cfi_offset w29, -16 +; CHECK0-NEXT: add x8, sp, #128 +; CHECK0-NEXT: str w2, [x8, x0, lsl #2] +; CHECK0-NEXT: mov x8, sp +; CHECK0-NEXT: str s0, [x8, x1, lsl #2] +; CHECK0-NEXT: add sp, sp, #272 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: ordering_test_array: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #400 +; CHECK64-NEXT: str x29, [sp, #384] // 8-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 400 +; CHECK64-NEXT: .cfi_offset w29, -16 +; CHECK64-NEXT: mov x8, sp +; CHECK64-NEXT: str w2, [x8, x0, lsl #2] +; CHECK64-NEXT: add x8, sp, #192 +; CHECK64-NEXT: str s0, [x8, x1, lsl #2] +; CHECK64-NEXT: add sp, sp, #400 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: ordering_test_array: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1280 +; CHECK1024-NEXT: .cfi_def_cfa_offset 2320 +; CHECK1024-NEXT: .cfi_offset w29, -16 +; CHECK1024-NEXT: mov x8, sp +; CHECK1024-NEXT: str w2, [x8, x0, lsl #2] +; CHECK1024-NEXT: add x8, sp, #1152 +; CHECK1024-NEXT: str s0, [x8, x1, lsl #2] +; CHECK1024-NEXT: add sp, sp, #1280 +; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1040 +; CHECK1024-NEXT: ret +entry: + %i32 = alloca [32 x i32] + %f32 = alloca [32 x float] + %g = getelementptr i32, ptr %i32, i64 %o + store i32 %x, ptr %g + %h = getelementptr float, ptr %f32, i64 %p + store float %f, ptr %h + ret void +} + +; The VA register currently ends up in VLA space. Let's hope that doesn't come up very often. +define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" { +; CHECK0-LABEL: vastate: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 112 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK0-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK0-NEXT: add x29, sp, #64 +; CHECK0-NEXT: .cfi_def_cfa w29, 48 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w20, -16 +; CHECK0-NEXT: .cfi_offset w30, -40 +; CHECK0-NEXT: .cfi_offset w29, -48 +; CHECK0-NEXT: .cfi_offset b8, -56 +; CHECK0-NEXT: .cfi_offset b9, -64 +; CHECK0-NEXT: .cfi_offset b10, -72 +; CHECK0-NEXT: .cfi_offset b11, -80 +; CHECK0-NEXT: .cfi_offset b12, -88 +; CHECK0-NEXT: .cfi_offset b13, -96 +; CHECK0-NEXT: .cfi_offset b14, -104 +; CHECK0-NEXT: .cfi_offset b15, -112 +; CHECK0-NEXT: sub sp, sp, #16 +; CHECK0-NEXT: rdsvl x8, #1 +; CHECK0-NEXT: mov x9, sp +; CHECK0-NEXT: mov w20, w0 +; CHECK0-NEXT: msub x9, x8, x8, x9 +; CHECK0-NEXT: mov sp, x9 +; CHECK0-NEXT: stur x9, [x29, #-80] +; CHECK0-NEXT: sub x9, x29, #80 +; CHECK0-NEXT: sturh wzr, [x29, #-70] +; CHECK0-NEXT: stur wzr, [x29, #-68] +; CHECK0-NEXT: sturh w8, [x29, #-72] +; CHECK0-NEXT: msr TPIDR2_EL0, x9 +; CHECK0-NEXT: .cfi_offset vg, -32 +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: bl other +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: smstart za +; CHECK0-NEXT: mrs x8, TPIDR2_EL0 +; CHECK0-NEXT: sub x0, x29, #80 +; CHECK0-NEXT: cbnz x8, .LBB33_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: bl __arm_tpidr2_restore +; CHECK0-NEXT: .LBB33_2: // %entry +; CHECK0-NEXT: mov w0, w20 +; CHECK0-NEXT: msr TPIDR2_EL0, xzr +; CHECK0-NEXT: sub sp, x29, #64 +; CHECK0-NEXT: .cfi_def_cfa wsp, 112 +; CHECK0-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w20 +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: .cfi_restore b8 +; CHECK0-NEXT: .cfi_restore b9 +; CHECK0-NEXT: .cfi_restore b10 +; CHECK0-NEXT: .cfi_restore b11 +; CHECK0-NEXT: .cfi_restore b12 +; CHECK0-NEXT: .cfi_restore b13 +; CHECK0-NEXT: .cfi_restore b14 +; CHECK0-NEXT: .cfi_restore b15 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: vastate: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp d15, d14, [sp, #-176]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 176 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x20, [sp, #144] // 16-byte Folded Spill +; CHECK64-NEXT: str x19, [sp, #160] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 176 +; CHECK64-NEXT: .cfi_offset w19, -16 +; CHECK64-NEXT: .cfi_offset w20, -24 +; CHECK64-NEXT: .cfi_offset w30, -40 +; CHECK64-NEXT: .cfi_offset w29, -48 +; CHECK64-NEXT: .cfi_offset b8, -120 +; CHECK64-NEXT: .cfi_offset b9, -128 +; CHECK64-NEXT: .cfi_offset b10, -136 +; CHECK64-NEXT: .cfi_offset b11, -144 +; CHECK64-NEXT: .cfi_offset b12, -152 +; CHECK64-NEXT: .cfi_offset b13, -160 +; CHECK64-NEXT: .cfi_offset b14, -168 +; CHECK64-NEXT: .cfi_offset b15, -176 +; CHECK64-NEXT: sub sp, sp, #80 +; CHECK64-NEXT: rdsvl x8, #1 +; CHECK64-NEXT: mov x9, sp +; CHECK64-NEXT: mov w20, w0 +; CHECK64-NEXT: msub x9, x8, x8, x9 +; CHECK64-NEXT: mov sp, x9 +; CHECK64-NEXT: stur x9, [x29, #-80] +; CHECK64-NEXT: sub x9, x29, #80 +; CHECK64-NEXT: sturh wzr, [x29, #-70] +; CHECK64-NEXT: stur wzr, [x29, #-68] +; CHECK64-NEXT: sturh w8, [x29, #-72] +; CHECK64-NEXT: msr TPIDR2_EL0, x9 +; CHECK64-NEXT: .cfi_offset vg, -32 +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: bl other +; CHECK64-NEXT: smstart sm +; CHECK64-NEXT: .cfi_restore vg +; CHECK64-NEXT: smstart za +; CHECK64-NEXT: mrs x8, TPIDR2_EL0 +; CHECK64-NEXT: sub x0, x29, #80 +; CHECK64-NEXT: cbnz x8, .LBB33_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: bl __arm_tpidr2_restore +; CHECK64-NEXT: .LBB33_2: // %entry +; CHECK64-NEXT: mov w0, w20 +; CHECK64-NEXT: msr TPIDR2_EL0, xzr +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 176 +; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload +; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK64-NEXT: ldr d15, [sp], #176 // 8-byte Folded Reload +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: .cfi_restore w19 +; CHECK64-NEXT: .cfi_restore w20 +; CHECK64-NEXT: .cfi_restore w30 +; CHECK64-NEXT: .cfi_restore w29 +; CHECK64-NEXT: .cfi_restore b8 +; CHECK64-NEXT: .cfi_restore b9 +; CHECK64-NEXT: .cfi_restore b10 +; CHECK64-NEXT: .cfi_restore b11 +; CHECK64-NEXT: .cfi_restore b12 +; CHECK64-NEXT: .cfi_restore b13 +; CHECK64-NEXT: .cfi_restore b14 +; CHECK64-NEXT: .cfi_restore b15 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: vastate: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1136 +; CHECK1024-NEXT: .cfi_def_cfa_offset 1136 +; CHECK1024-NEXT: cntd x9 +; CHECK1024-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK1024-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK1024-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK1024-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK1024-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; 
CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK1024-NEXT: mov x29, sp +; CHECK1024-NEXT: .cfi_def_cfa w29, 1136 +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w20, -16 +; CHECK1024-NEXT: .cfi_offset w28, -24 +; CHECK1024-NEXT: .cfi_offset w30, -40 +; CHECK1024-NEXT: .cfi_offset w29, -48 +; CHECK1024-NEXT: .cfi_offset b8, -1080 +; CHECK1024-NEXT: .cfi_offset b9, -1088 +; CHECK1024-NEXT: .cfi_offset b10, -1096 +; CHECK1024-NEXT: .cfi_offset b11, -1104 +; CHECK1024-NEXT: .cfi_offset b12, -1112 +; CHECK1024-NEXT: .cfi_offset b13, -1120 +; CHECK1024-NEXT: .cfi_offset b14, -1128 +; CHECK1024-NEXT: .cfi_offset b15, -1136 +; CHECK1024-NEXT: sub sp, sp, #1040 +; CHECK1024-NEXT: rdsvl x8, #1 +; CHECK1024-NEXT: mov x9, sp +; CHECK1024-NEXT: mov w20, w0 +; CHECK1024-NEXT: msub x9, x8, x8, x9 +; CHECK1024-NEXT: mov sp, x9 +; CHECK1024-NEXT: sub x10, x29, #784 +; CHECK1024-NEXT: stur x9, [x10, #-256] +; CHECK1024-NEXT: sub x9, x29, #774 +; CHECK1024-NEXT: sub x10, x29, #772 +; CHECK1024-NEXT: sturh wzr, [x9, #-256] +; CHECK1024-NEXT: sub x9, x29, #1040 +; CHECK1024-NEXT: stur wzr, [x10, #-256] +; CHECK1024-NEXT: sub x10, x29, #776 +; CHECK1024-NEXT: sturh w8, [x10, #-256] +; CHECK1024-NEXT: msr TPIDR2_EL0, x9 +; CHECK1024-NEXT: .cfi_offset vg, -32 +; CHECK1024-NEXT: smstop sm +; CHECK1024-NEXT: bl other +; CHECK1024-NEXT: smstart sm +; CHECK1024-NEXT: .cfi_restore vg +; CHECK1024-NEXT: smstart za +; CHECK1024-NEXT: mrs x8, TPIDR2_EL0 +; CHECK1024-NEXT: sub x0, x29, #1040 +; CHECK1024-NEXT: cbnz x8, .LBB33_2 +; CHECK1024-NEXT: // %bb.1: // %entry +; CHECK1024-NEXT: bl __arm_tpidr2_restore +; CHECK1024-NEXT: .LBB33_2: // %entry +; CHECK1024-NEXT: mov w0, w20 +; CHECK1024-NEXT: msr TPIDR2_EL0, xzr +; CHECK1024-NEXT: mov sp, x29 +; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK1024-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK1024-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK1024-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK1024-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK1024-NEXT: add sp, sp, #1136 +; CHECK1024-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NEXT: .cfi_restore w19 +; CHECK1024-NEXT: .cfi_restore w20 +; CHECK1024-NEXT: .cfi_restore w28 +; CHECK1024-NEXT: .cfi_restore w30 +; CHECK1024-NEXT: .cfi_restore w29 +; CHECK1024-NEXT: .cfi_restore b8 +; CHECK1024-NEXT: .cfi_restore b9 +; CHECK1024-NEXT: .cfi_restore b10 +; CHECK1024-NEXT: .cfi_restore b11 +; CHECK1024-NEXT: .cfi_restore b12 +; CHECK1024-NEXT: .cfi_restore b13 +; CHECK1024-NEXT: .cfi_restore b14 +; CHECK1024-NEXT: .cfi_restore b15 +; CHECK1024-NEXT: ret +entry: + tail call void @other() + ret i32 %x +} +declare void @other()