diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index ade175d99c89a..522d8513c9aff 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -343,6 +343,10 @@ def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true [FeatureAVX10_1, FeatureEVEX512]>; def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true", "Support extended general purpose register">; +def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true", + "Support PUSH2/POP2 instructions">; +def FeaturePPX : SubtargetFeature<"ppx", "HasPPX", "true", + "Support Push-Pop Acceleration">; // Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka // "string operations"). See "REP String Enhancement" in the Intel Software diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index b042f6865f40d..c0d358ead2787 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -41,6 +41,7 @@ STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); STATISTIC(NumFrameExtraProbe, "Number of extra stack probes generated in prologue"); +STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2"); using namespace llvm; @@ -139,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) { return X86::MOV32ri; } +// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the +// value written by the PUSH from the stack. The processor tracks these marked +// instructions internally and fast-forwards register data between matching PUSH +// and POP instructions, without going through memory or through the training +// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient +// memory-renaming optimization can be used. +// +// The PPX hint is purely a performance hint. Instructions with this hint have +// the same functional semantics as those without. 
PPX hints set by the +// compiler that violate the balancing rule may turn off the PPX optimization, +// but they will not affect program semantics. +// +// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp +// are not considered). +// +// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2 +// GPRs at a time to/from the stack. +static unsigned getPUSHOpcode(const X86Subtarget &ST) { + return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r) + : X86::PUSH32r; +} +static unsigned getPOPOpcode(const X86Subtarget &ST) { + return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r) + : X86::POP32r; +} +static unsigned getPUSH2Opcode(const X86Subtarget &ST) { + return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2; +} +static unsigned getPOP2Opcode(const X86Subtarget &ST) { + return ST.hasPPX() ? X86::POP2P : X86::POP2; +} + static bool isEAXLiveIn(MachineBasicBlock &MBB) { for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) { unsigned Reg = RegMask.PhysReg; @@ -1679,7 +1712,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = alignTo(NumBytes, MaxAlign); // Save EBP/RBP into the appropriate stack slot. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + BuildMI(MBB, MBBI, DL, + TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>()))) .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); @@ -1818,18 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Skip the callee-saved push instructions. 
bool PushedRegs = false; int StackOffset = 2 * stackGrowth; + MachineBasicBlock::const_iterator LastCSPush = MBBI; + auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) { + if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup)) + return false; + unsigned Opc = MBBI->getOpcode(); + return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r || + Opc == X86::PUSH2 || Opc == X86::PUSH2P; + }; - while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) && - (MBBI->getOpcode() == X86::PUSH32r || - MBBI->getOpcode() == X86::PUSH64r)) { + while (IsCSPush(MBBI)) { PushedRegs = true; Register Reg = MBBI->getOperand(0).getReg(); + LastCSPush = MBBI; ++MBBI; + unsigned Opc = LastCSPush->getOpcode(); if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); + // Compared to push, push2 introduces more stack offset (one more + // register). + if (Opc == X86::PUSH2 || Opc == X86::PUSH2P) + StackOffset += stackGrowth; BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset), MachineInstr::FrameSetup); @@ -1841,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(Reg) .setMIFlag(MachineInstr::FrameSetup); + if (Opc == X86::PUSH2 || Opc == X86::PUSH2P) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(LastCSPush->getOperand(1).getReg()) + .setMIFlag(MachineInstr::FrameSetup); } } @@ -2317,7 +2367,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true); } // Pop EBP. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::POP64r : X86::POP32r), + BuildMI(MBB, MBBI, DL, + TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); @@ -2357,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned Opc = PI->getOpcode(); if (Opc != X86::DBG_VALUE && !PI->isTerminator()) { - if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy))) + if (!PI->getFlag(MachineInstr::FrameDestroy) || + (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 && + Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 && + Opc != X86::POP2P && Opc != X86::LEA64r)) break; FirstCSPop = PI; } @@ -2451,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = MBBI; unsigned Opc = PI->getOpcode(); ++MBBI; - if (Opc == X86::POP32r || Opc == X86::POP64r) { + if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r || + Opc == X86::POP2 || Opc == X86::POP2P) { Offset += SlotSize; + // Compared to pop, pop2 introduces more stack offset (one more + // register). + if (Opc == X86::POP2 || Opc == X86::POP2P) + Offset += SlotSize; BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset), MachineInstr::FrameDestroy); @@ -2735,6 +2791,30 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( } } + // Strategy: + // 1. Use push2 when + // a) number of CSR > 1 if no padding is needed + // b) number of CSR > 2 if padding is needed + // 2. When the number of CSR push is odd + // a. Start to use push2 from the 1st push if stack is 16B aligned. + // b. Start to use push2 from the 2nd push if stack is not 16B aligned. + // 3. 
When the number of CSR push is even, start to use push2 from the 1st + // push and make the stack 16B aligned before the push + unsigned NumRegsForPush2 = 0; + if (STI.hasPush2Pop2()) { + unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) { + return X86::GR64RegClass.contains(I.getReg()); + }); + bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0); + bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1; + X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2); + NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0; + if (X86FI->padForPush2Pop2()) { + SpillSlotOffset -= SlotSize; + MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + } + } + // Assign slots for GPRs. It increases frame size. for (CalleeSavedInfo &I : llvm::reverse(CSI)) { Register Reg = I.getReg(); @@ -2742,6 +2822,13 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; + // A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned + // or when the candidate set currently holds an odd number of registers. + if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 && + (SpillSlotOffset % 16 == 0 || + X86FI->getNumCandidatesForPush2Pop2() % 2)) + X86FI->addCandidateForPush2Pop2(Reg); + SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; @@ -2759,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // TODO: saving the slot index is better? X86FI->setRestoreBasePointer(CalleeSavedFrameSize); } + assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 && + "Expect even candidates for push2/pop2"); + if (X86FI->getNumCandidatesForPush2Pop2()) + ++NumFunctionUsingPush2Pop2; X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize); @@ -2808,41 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Push GPRs. It increases frame size. 
const MachineFunction &MF = *MBB.getParent(); - unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; - for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - Register Reg = I.getReg(); - - if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) - continue; + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (X86FI->padForPush2Pop2()) + emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false); + // Update LiveIn of the basic block and decide whether we can add a kill flag + // to the use. + auto UpdateLiveInCheckCanKill = [&](Register Reg) { const MachineRegisterInfo &MRI = MF.getRegInfo(); - bool isLiveIn = MRI.isLiveIn(Reg); - if (!isLiveIn) - MBB.addLiveIn(Reg); - - // Decide whether we can add a kill flag to the use. - bool CanKill = !isLiveIn; - // Check if any subregister is live-in - if (CanKill) { - for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) { - if (MRI.isLiveIn(*AReg)) { - CanKill = false; - break; - } - } - } - // Do not set a kill flag on values that are also marked as live-in. This // happens with the @llvm-returnaddress intrinsic and with arguments // passed in callee saved registers. // Omitting the kill flags is conservatively correct even if the live-in // is not used after all. 
- BuildMI(MBB, MI, DL, TII.get(Opc)) - .addReg(Reg, getKillRegState(CanKill)) - .setMIFlag(MachineInstr::FrameSetup); + if (MRI.isLiveIn(Reg)) + return false; + MBB.addLiveIn(Reg); + // Check if any subregister is live-in + for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) + if (MRI.isLiveIn(*AReg)) + return false; + return true; + }; + auto UpdateLiveInGetKillRegState = [&](Register Reg) { + return getKillRegState(UpdateLiveInCheckCanKill(Reg)); + }; + + for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) { + Register Reg = RI->getReg(); + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + if (X86FI->isCandidateForPush2Pop2(Reg)) { + Register Reg2 = (++RI)->getReg(); + BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI))) + .addReg(Reg, UpdateLiveInGetKillRegState(Reg)) + .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2)) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI))) + .addReg(Reg, UpdateLiveInGetKillRegState(Reg)) + .setMIFlag(MachineInstr::FrameSetup); + } } - const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); if (X86FI->getRestoreBasePointer()) { unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; Register BaseReg = this->TRI->getBaseRegister(); @@ -2958,15 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( } // POP GPRs. - unsigned Opc = STI.is64Bit() ? 
X86::POP64r : X86::POP32r; - for (const CalleeSavedInfo &I : CSI) { - Register Reg = I.getReg(); + for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) { + Register Reg = I->getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg) - .setMIFlag(MachineInstr::FrameDestroy); + if (X86FI->isCandidateForPush2Pop2(Reg)) + BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg) + .addReg((++I)->getReg(), RegState::Define) + .setMIFlag(MachineInstr::FrameDestroy); + else + BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } + if (X86FI->padForPush2Pop2()) + emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true); + return true; } diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 9b2cc35c57e00..f6e853270e073 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include <set> namespace llvm { @@ -117,6 +118,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// determine if we should insert tilerelease in frame lowering. bool HasVirtualTileReg = false; + /// Adjust stack for push2/pop2 + bool PadForPush2Pop2 = false; + + /// Candidate registers for push2/pop2 + std::set<Register> CandidatesForPush2Pop2; + /// True if this function has CFI directives that adjust the CFA. /// This is used to determine if we should direct the debugger to use /// the CFA instead of the stack pointer. 
@@ -165,7 +172,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { const DenseMap& getWinEHXMMSlotInfo() const { return WinEHXMMSlotInfo; } - unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + unsigned getCalleeSavedFrameSize() const { + return CalleeSavedFrameSize + 8 * padForPush2Pop2(); + } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } @@ -232,6 +241,19 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { bool hasVirtualTileReg() const { return HasVirtualTileReg; } void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; } + bool padForPush2Pop2() const { return PadForPush2Pop2; } + void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; } + + bool isCandidateForPush2Pop2(Register Reg) const { + return CandidatesForPush2Pop2.find(Reg) != CandidatesForPush2Pop2.end(); + } + void addCandidateForPush2Pop2(Register Reg) { + CandidatesForPush2Pop2.insert(Reg); + } + size_t getNumCandidatesForPush2Pop2() const { + return CandidatesForPush2Pop2.size(); + } + bool hasCFIAdjustCfa() const { return HasCFIAdjustCfa; } void setHasCFIAdjustCfa(bool v) { HasCFIAdjustCfa = v; } diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll new file mode 100644 index 0000000000000..6c9fdc2adce2f --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=LIN-REF +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+push2pop2 | FileCheck %s --check-prefix=LIN +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=LIN-PPX +; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s --check-prefix=WIN-REF +; RUN: llc < %s 
-mtriple=x86_64-windows-msvc -mattr=+push2pop2 | FileCheck %s --check-prefix=WIN +; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=WIN-PPX + +define i32 @csr6_alloc16(ptr %argv) { +; LIN-REF-LABEL: csr6_alloc16: +; LIN-REF: # %bb.0: # %entry +; LIN-REF-NEXT: pushq %rbp +; LIN-REF-NEXT: .cfi_def_cfa_offset 16 +; LIN-REF-NEXT: pushq %r15 +; LIN-REF-NEXT: .cfi_def_cfa_offset 24 +; LIN-REF-NEXT: pushq %r14 +; LIN-REF-NEXT: .cfi_def_cfa_offset 32 +; LIN-REF-NEXT: pushq %r13 +; LIN-REF-NEXT: .cfi_def_cfa_offset 40 +; LIN-REF-NEXT: pushq %r12 +; LIN-REF-NEXT: .cfi_def_cfa_offset 48 +; LIN-REF-NEXT: pushq %rbx +; LIN-REF-NEXT: .cfi_def_cfa_offset 56 +; LIN-REF-NEXT: subq $24, %rsp +; LIN-REF-NEXT: .cfi_def_cfa_offset 80 +; LIN-REF-NEXT: .cfi_offset %rbx, -56 +; LIN-REF-NEXT: .cfi_offset %r12, -48 +; LIN-REF-NEXT: .cfi_offset %r13, -40 +; LIN-REF-NEXT: .cfi_offset %r14, -32 +; LIN-REF-NEXT: .cfi_offset %r15, -24 +; LIN-REF-NEXT: .cfi_offset %rbp, -16 +; LIN-REF-NEXT: #APP +; LIN-REF-NEXT: #NO_APP +; LIN-REF-NEXT: xorl %ecx, %ecx +; LIN-REF-NEXT: xorl %eax, %eax +; LIN-REF-NEXT: callq *%rcx +; LIN-REF-NEXT: addq $24, %rsp +; LIN-REF-NEXT: .cfi_def_cfa_offset 56 +; LIN-REF-NEXT: popq %rbx +; LIN-REF-NEXT: .cfi_def_cfa_offset 48 +; LIN-REF-NEXT: popq %r12 +; LIN-REF-NEXT: .cfi_def_cfa_offset 40 +; LIN-REF-NEXT: popq %r13 +; LIN-REF-NEXT: .cfi_def_cfa_offset 32 +; LIN-REF-NEXT: popq %r14 +; LIN-REF-NEXT: .cfi_def_cfa_offset 24 +; LIN-REF-NEXT: popq %r15 +; LIN-REF-NEXT: .cfi_def_cfa_offset 16 +; LIN-REF-NEXT: popq %rbp +; LIN-REF-NEXT: .cfi_def_cfa_offset 8 +; LIN-REF-NEXT: retq +; +; LIN-LABEL: csr6_alloc16: +; LIN: # %bb.0: # %entry +; LIN-NEXT: pushq %rax +; LIN-NEXT: .cfi_def_cfa_offset 16 +; LIN-NEXT: push2 %r15, %rbp +; LIN-NEXT: .cfi_def_cfa_offset 32 +; LIN-NEXT: push2 %r13, %r14 +; LIN-NEXT: .cfi_def_cfa_offset 48 +; LIN-NEXT: push2 %rbx, %r12 +; LIN-NEXT: .cfi_def_cfa_offset 64 +; LIN-NEXT: subq $32, %rsp +; LIN-NEXT: 
.cfi_def_cfa_offset 96 +; LIN-NEXT: .cfi_offset %rbx, -64 +; LIN-NEXT: .cfi_offset %r12, -56 +; LIN-NEXT: .cfi_offset %r13, -48 +; LIN-NEXT: .cfi_offset %r14, -40 +; LIN-NEXT: .cfi_offset %r15, -32 +; LIN-NEXT: .cfi_offset %rbp, -24 +; LIN-NEXT: #APP +; LIN-NEXT: #NO_APP +; LIN-NEXT: xorl %ecx, %ecx +; LIN-NEXT: xorl %eax, %eax +; LIN-NEXT: callq *%rcx +; LIN-NEXT: addq $32, %rsp +; LIN-NEXT: .cfi_def_cfa_offset 64 +; LIN-NEXT: pop2 %r12, %rbx +; LIN-NEXT: .cfi_def_cfa_offset 48 +; LIN-NEXT: pop2 %r14, %r13 +; LIN-NEXT: .cfi_def_cfa_offset 32 +; LIN-NEXT: pop2 %rbp, %r15 +; LIN-NEXT: .cfi_def_cfa_offset 16 +; LIN-NEXT: popq %rcx +; LIN-NEXT: .cfi_def_cfa_offset 8 +; LIN-NEXT: retq +; +; LIN-PPX-LABEL: csr6_alloc16: +; LIN-PPX: # %bb.0: # %entry +; LIN-PPX-NEXT: pushq %rax +; LIN-PPX-NEXT: .cfi_def_cfa_offset 16 +; LIN-PPX-NEXT: push2p %r15, %rbp +; LIN-PPX-NEXT: .cfi_def_cfa_offset 32 +; LIN-PPX-NEXT: push2p %r13, %r14 +; LIN-PPX-NEXT: .cfi_def_cfa_offset 48 +; LIN-PPX-NEXT: push2p %rbx, %r12 +; LIN-PPX-NEXT: .cfi_def_cfa_offset 64 +; LIN-PPX-NEXT: subq $32, %rsp +; LIN-PPX-NEXT: .cfi_def_cfa_offset 96 +; LIN-PPX-NEXT: .cfi_offset %rbx, -64 +; LIN-PPX-NEXT: .cfi_offset %r12, -56 +; LIN-PPX-NEXT: .cfi_offset %r13, -48 +; LIN-PPX-NEXT: .cfi_offset %r14, -40 +; LIN-PPX-NEXT: .cfi_offset %r15, -32 +; LIN-PPX-NEXT: .cfi_offset %rbp, -24 +; LIN-PPX-NEXT: #APP +; LIN-PPX-NEXT: #NO_APP +; LIN-PPX-NEXT: xorl %ecx, %ecx +; LIN-PPX-NEXT: xorl %eax, %eax +; LIN-PPX-NEXT: callq *%rcx +; LIN-PPX-NEXT: addq $32, %rsp +; LIN-PPX-NEXT: .cfi_def_cfa_offset 64 +; LIN-PPX-NEXT: pop2p %r12, %rbx +; LIN-PPX-NEXT: .cfi_def_cfa_offset 48 +; LIN-PPX-NEXT: pop2p %r14, %r13 +; LIN-PPX-NEXT: .cfi_def_cfa_offset 32 +; LIN-PPX-NEXT: pop2p %rbp, %r15 +; LIN-PPX-NEXT: .cfi_def_cfa_offset 16 +; LIN-PPX-NEXT: popq %rcx +; LIN-PPX-NEXT: .cfi_def_cfa_offset 8 +; LIN-PPX-NEXT: retq +; +; WIN-REF-LABEL: csr6_alloc16: +; WIN-REF: # %bb.0: # %entry +; WIN-REF-NEXT: pushq %r15 +; WIN-REF-NEXT: 
.seh_pushreg %r15 +; WIN-REF-NEXT: pushq %r14 +; WIN-REF-NEXT: .seh_pushreg %r14 +; WIN-REF-NEXT: pushq %r13 +; WIN-REF-NEXT: .seh_pushreg %r13 +; WIN-REF-NEXT: pushq %r12 +; WIN-REF-NEXT: .seh_pushreg %r12 +; WIN-REF-NEXT: pushq %rbp +; WIN-REF-NEXT: .seh_pushreg %rbp +; WIN-REF-NEXT: pushq %rbx +; WIN-REF-NEXT: .seh_pushreg %rbx +; WIN-REF-NEXT: subq $56, %rsp +; WIN-REF-NEXT: .seh_stackalloc 56 +; WIN-REF-NEXT: .seh_endprologue +; WIN-REF-NEXT: #APP +; WIN-REF-NEXT: #NO_APP +; WIN-REF-NEXT: xorl %eax, %eax +; WIN-REF-NEXT: callq *%rax +; WIN-REF-NEXT: nop +; WIN-REF-NEXT: addq $56, %rsp +; WIN-REF-NEXT: popq %rbx +; WIN-REF-NEXT: popq %rbp +; WIN-REF-NEXT: popq %r12 +; WIN-REF-NEXT: popq %r13 +; WIN-REF-NEXT: popq %r14 +; WIN-REF-NEXT: popq %r15 +; WIN-REF-NEXT: retq +; WIN-REF-NEXT: .seh_endproc +; +; WIN-LABEL: csr6_alloc16: +; WIN: # %bb.0: # %entry +; WIN-NEXT: pushq %rax +; WIN-NEXT: .seh_pushreg %rax +; WIN-NEXT: push2 %r14, %r15 +; WIN-NEXT: .seh_pushreg %r15 +; WIN-NEXT: .seh_pushreg %r14 +; WIN-NEXT: push2 %r12, %r13 +; WIN-NEXT: .seh_pushreg %r13 +; WIN-NEXT: .seh_pushreg %r12 +; WIN-NEXT: push2 %rbx, %rbp +; WIN-NEXT: .seh_pushreg %rbp +; WIN-NEXT: .seh_pushreg %rbx +; WIN-NEXT: subq $64, %rsp +; WIN-NEXT: .seh_stackalloc 64 +; WIN-NEXT: .seh_endprologue +; WIN-NEXT: #APP +; WIN-NEXT: #NO_APP +; WIN-NEXT: xorl %eax, %eax +; WIN-NEXT: callq *%rax +; WIN-NEXT: nop +; WIN-NEXT: addq $64, %rsp +; WIN-NEXT: pop2 %rbp, %rbx +; WIN-NEXT: pop2 %r13, %r12 +; WIN-NEXT: pop2 %r15, %r14 +; WIN-NEXT: popq %rcx +; WIN-NEXT: retq +; WIN-NEXT: .seh_endproc +; +; WIN-PPX-LABEL: csr6_alloc16: +; WIN-PPX: # %bb.0: # %entry +; WIN-PPX-NEXT: pushq %rax +; WIN-PPX-NEXT: .seh_pushreg %rax +; WIN-PPX-NEXT: push2p %r14, %r15 +; WIN-PPX-NEXT: .seh_pushreg %r15 +; WIN-PPX-NEXT: .seh_pushreg %r14 +; WIN-PPX-NEXT: push2p %r12, %r13 +; WIN-PPX-NEXT: .seh_pushreg %r13 +; WIN-PPX-NEXT: .seh_pushreg %r12 +; WIN-PPX-NEXT: push2p %rbx, %rbp +; WIN-PPX-NEXT: .seh_pushreg %rbp +; 
WIN-PPX-NEXT: .seh_pushreg %rbx +; WIN-PPX-NEXT: subq $64, %rsp +; WIN-PPX-NEXT: .seh_stackalloc 64 +; WIN-PPX-NEXT: .seh_endprologue +; WIN-PPX-NEXT: #APP +; WIN-PPX-NEXT: #NO_APP +; WIN-PPX-NEXT: xorl %eax, %eax +; WIN-PPX-NEXT: callq *%rax +; WIN-PPX-NEXT: nop +; WIN-PPX-NEXT: addq $64, %rsp +; WIN-PPX-NEXT: pop2p %rbp, %rbx +; WIN-PPX-NEXT: pop2p %r13, %r12 +; WIN-PPX-NEXT: pop2p %r15, %r14 +; WIN-PPX-NEXT: popq %rcx +; WIN-PPX-NEXT: retq +; WIN-PPX-NEXT: .seh_endproc +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"() + %a = alloca [3 x ptr], align 8 + %b = call ptr (...) null() + ret i32 undef +} diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll new file mode 100644 index 0000000000000..aa5c54d30e3bc --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check PUSH2/POP2 is not used for vector registers +; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME + +define void @widget(float %arg) nounwind { +; CHECK-LABEL: widget: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: push2 %rbp, %rsi +; CHECK-NEXT: subq $48, %rsp +; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm6 +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: callq *%rsi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: callq *%rsi +; CHECK-NEXT: movss %xmm6, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; CHECK-NEXT: addq $48, %rsp +; CHECK-NEXT: pop2 
%rsi, %rbp +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: retq +; +; FRAME-LABEL: widget: +; FRAME: # %bb.0: # %bb +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: push2 %rsi, %r15 +; FRAME-NEXT: subq $48, %rsp +; FRAME-NEXT: leaq {{[0-9]+}}(%rsp), %rbp +; FRAME-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FRAME-NEXT: movaps %xmm0, %xmm6 +; FRAME-NEXT: xorl %esi, %esi +; FRAME-NEXT: xorl %ecx, %ecx +; FRAME-NEXT: callq *%rsi +; FRAME-NEXT: xorl %ecx, %ecx +; FRAME-NEXT: xorl %edx, %edx +; FRAME-NEXT: xorl %r8d, %r8d +; FRAME-NEXT: callq *%rsi +; FRAME-NEXT: movss %xmm6, 0 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; FRAME-NEXT: addq $48, %rsp +; FRAME-NEXT: pop2 %r15, %rsi +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +bb: + %call = tail call float null(ptr null) + %call1 = tail call i32 null(ptr null, i32 0, i32 0) + store float %arg, ptr null, align 4 + tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"() + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2.ll b/llvm/test/CodeGen/X86/apx/push2-pop2.ll new file mode 100644 index 0000000000000..25139f1da8272 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/push2-pop2.ll @@ -0,0 +1,432 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=PPX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME + +define void @csr1() nounwind { +; CHECK-LABEL: csr1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +; +; PPX-LABEL: csr1: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushp %rbp +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: popp %rbp +; PPX-NEXT: 
retq +; +; FRAME-LABEL: csr1: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr2() nounwind { +; CHECK-LABEL: csr2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +; +; PPX-LABEL: csr2: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushp %rbp +; PPX-NEXT: pushp %r15 +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: popp %r15 +; PPX-NEXT: popp %rbp +; PPX-NEXT: retq +; +; FRAME-LABEL: csr2: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: pushq %r15 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %r15 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr3() nounwind { +; CHECK-LABEL: csr3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: push2 %r14, %r15 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: pop2 %r15, %r14 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +; +; PPX-LABEL: csr3: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushp %rbp +; PPX-NEXT: push2p %r14, %r15 +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: pop2p %r15, %r14 +; PPX-NEXT: popp %rbp +; PPX-NEXT: retq +; +; FRAME-LABEL: csr3: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: pop2 %r15, %r14 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr4() nounwind { +; CHECK-LABEL: 
csr4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: push2 %r15, %rbp +; CHECK-NEXT: push2 %r13, %r14 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: pop2 %r14, %r13 +; CHECK-NEXT: pop2 %rbp, %r15 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq +; +; PPX-LABEL: csr4: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushq %rax +; PPX-NEXT: push2p %r15, %rbp +; PPX-NEXT: push2p %r13, %r14 +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: pop2p %r14, %r13 +; PPX-NEXT: pop2p %rbp, %r15 +; PPX-NEXT: popq %rax +; PPX-NEXT: retq +; +; FRAME-LABEL: csr4: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: pushq %r13 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %r13 +; FRAME-NEXT: pop2 %r15, %r14 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr5() nounwind { +; CHECK-LABEL: csr5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: push2 %r14, %r15 +; CHECK-NEXT: push2 %r12, %r13 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: pop2 %r13, %r12 +; CHECK-NEXT: pop2 %r15, %r14 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq +; +; PPX-LABEL: csr5: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushp %rbp +; PPX-NEXT: push2p %r14, %r15 +; PPX-NEXT: push2p %r12, %r13 +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: pop2p %r13, %r12 +; PPX-NEXT: pop2p %r15, %r14 +; PPX-NEXT: popp %rbp +; PPX-NEXT: retq +; +; FRAME-LABEL: csr5: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: push2 %r12, %r13 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: pop2 %r13, %r12 +; FRAME-NEXT: pop2 %r15, %r14 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", 
"~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +define void @csr6() nounwind { +; CHECK-LABEL: csr6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: push2 %r15, %rbp +; CHECK-NEXT: push2 %r13, %r14 +; CHECK-NEXT: push2 %rbx, %r12 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: pop2 %r12, %rbx +; CHECK-NEXT: pop2 %r14, %r13 +; CHECK-NEXT: pop2 %rbp, %r15 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq +; +; PPX-LABEL: csr6: +; PPX: # %bb.0: # %entry +; PPX-NEXT: pushq %rax +; PPX-NEXT: push2p %r15, %rbp +; PPX-NEXT: push2p %r13, %r14 +; PPX-NEXT: push2p %rbx, %r12 +; PPX-NEXT: #APP +; PPX-NEXT: #NO_APP +; PPX-NEXT: pop2p %r12, %rbx +; PPX-NEXT: pop2p %r14, %r13 +; PPX-NEXT: pop2p %rbp, %r15 +; PPX-NEXT: popq %rax +; PPX-NEXT: retq +; +; FRAME-LABEL: csr6: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: push2 %r12, %r13 +; FRAME-NEXT: pushq %rbx +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rbx +; FRAME-NEXT: pop2 %r13, %r12 +; FRAME-NEXT: pop2 %r15, %r14 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"() + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10) nounwind { +; CHECK-LABEL: lea_in_epilog: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB6_5 +; CHECK-NEXT: # %bb.1: # %bb13 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: push2 %r15, %rbp +; CHECK-NEXT: push2 %r13, %r14 +; CHECK-NEXT: push2 %rbx, %r12 +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: movq %r9, %r14 +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: 
addq {{[0-9]+}}(%rsp), %r14 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; CHECK-NEXT: addq %r14, %r13 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: addq %r14, %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; CHECK-NEXT: addq %r14, %rbx +; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB6_2: # %bb15 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: incq %r12 +; CHECK-NEXT: movl $432, %edx # imm = 0x1B0 +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movq %r15, %rsi +; CHECK-NEXT: callq memcpy@PLT +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %rax, %r15 +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq $8, %rbp +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB6_2 +; CHECK-NEXT: # %bb.3: # %bb11 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; CHECK-NEXT: pop2 %r12, %rbx +; CHECK-NEXT: pop2 %r14, %r13 +; CHECK-NEXT: pop2 %rbp, %r15 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; CHECK-NEXT: jne .LBB6_5 +; CHECK-NEXT: # %bb.4: # %bb12 +; CHECK-NEXT: movq $0, (%rax) +; CHECK-NEXT: .LBB6_5: # %bb14 +; CHECK-NEXT: retq +; +; PPX-LABEL: lea_in_epilog: +; PPX: # %bb.0: # %bb +; PPX-NEXT: testb $1, %dil +; PPX-NEXT: je .LBB6_5 +; PPX-NEXT: # %bb.1: # %bb13 +; PPX-NEXT: pushq %rax +; PPX-NEXT: push2p %r15, %rbp +; PPX-NEXT: push2p %r13, %r14 +; PPX-NEXT: push2p %rbx, %r12 +; PPX-NEXT: subq $16, %rsp +; PPX-NEXT: movq %r9, %r14 +; PPX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; PPX-NEXT: addq {{[0-9]+}}(%rsp), %r14 +; PPX-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; PPX-NEXT: addq %r14, %r13 +; PPX-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; PPX-NEXT: addq %r14, %r15 +; PPX-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; 
PPX-NEXT: addq %r14, %rbx +; PPX-NEXT: xorl %ebp, %ebp +; PPX-NEXT: xorl %r12d, %r12d +; PPX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; PPX-NEXT: .p2align 4, 0x90 +; PPX-NEXT: .LBB6_2: # %bb15 +; PPX-NEXT: # =>This Inner Loop Header: Depth=1 +; PPX-NEXT: incq %r12 +; PPX-NEXT: movl $432, %edx # imm = 0x1B0 +; PPX-NEXT: xorl %edi, %edi +; PPX-NEXT: movq %r15, %rsi +; PPX-NEXT: callq memcpy@PLT +; PPX-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; PPX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; PPX-NEXT: addq %rax, %r13 +; PPX-NEXT: addq %rax, %r15 +; PPX-NEXT: addq %rax, %rbx +; PPX-NEXT: addq %rax, %r14 +; PPX-NEXT: addq $8, %rbp +; PPX-NEXT: testb $1, %dil +; PPX-NEXT: je .LBB6_2 +; PPX-NEXT: # %bb.3: # %bb11 +; PPX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; PPX-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; PPX-NEXT: pop2p %r12, %rbx +; PPX-NEXT: pop2p %r14, %r13 +; PPX-NEXT: pop2p %rbp, %r15 +; PPX-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; PPX-NEXT: jne .LBB6_5 +; PPX-NEXT: # %bb.4: # %bb12 +; PPX-NEXT: movq $0, (%rax) +; PPX-NEXT: .LBB6_5: # %bb14 +; PPX-NEXT: retq +; +; FRAME-LABEL: lea_in_epilog: +; FRAME: # %bb.0: # %bb +; FRAME-NEXT: testb $1, %dil +; FRAME-NEXT: je .LBB6_5 +; FRAME-NEXT: # %bb.1: # %bb13 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: push2 %r12, %r13 +; FRAME-NEXT: pushq %rbx +; FRAME-NEXT: subq $24, %rsp +; FRAME-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FRAME-NEXT: addq 16(%rbp), %r9 +; FRAME-NEXT: movq 48(%rbp), %rbx +; FRAME-NEXT: addq %r9, %rbx +; FRAME-NEXT: movq 40(%rbp), %r12 +; FRAME-NEXT: addq %r9, %r12 +; FRAME-NEXT: movq 32(%rbp), %r15 +; FRAME-NEXT: addq %r9, %r15 +; FRAME-NEXT: xorl %r13d, %r13d +; FRAME-NEXT: xorl %r14d, %r14d +; FRAME-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FRAME-NEXT: .p2align 4, 0x90 +; FRAME-NEXT: .LBB6_2: # %bb15 +; FRAME-NEXT: # =>This Inner Loop Header: Depth=1 +; FRAME-NEXT: movq 
%r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FRAME-NEXT: incq %r14 +; FRAME-NEXT: movl $432, %edx # imm = 0x1B0 +; FRAME-NEXT: xorl %edi, %edi +; FRAME-NEXT: movq %r12, %rsi +; FRAME-NEXT: callq memcpy@PLT +; FRAME-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; FRAME-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; FRAME-NEXT: movq 16(%rbp), %rax +; FRAME-NEXT: addq %rax, %rbx +; FRAME-NEXT: addq %rax, %r12 +; FRAME-NEXT: addq %rax, %r15 +; FRAME-NEXT: addq %rax, %r9 +; FRAME-NEXT: addq $8, %r13 +; FRAME-NEXT: testb $1, %dil +; FRAME-NEXT: je .LBB6_2 +; FRAME-NEXT: # %bb.3: # %bb11 +; FRAME-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; FRAME-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; FRAME-NEXT: popq %rbx +; FRAME-NEXT: pop2 %r13, %r12 +; FRAME-NEXT: pop2 %r15, %r14 +; FRAME-NEXT: popq %rbp +; FRAME-NEXT: jne .LBB6_5 +; FRAME-NEXT: # %bb.4: # %bb12 +; FRAME-NEXT: movq $0, (%rax) +; FRAME-NEXT: .LBB6_5: # %bb14 +; FRAME-NEXT: retq +bb: + br i1 %arg, label %bb13, label %bb14 + +bb11: + br i1 %arg, label %bb14, label %bb12 + +bb12: + store double 0.000000e+00, ptr %arg1, align 8 + br label %bb14 + +bb13: + %getelementptr = getelementptr i8, ptr null, i64 %arg5 + br label %bb15 + +bb14: + ret void + +bb15: + %phi = phi i64 [ 0, %bb13 ], [ %add, %bb15 ] + %getelementptr16 = getelementptr double, ptr null, i64 %phi + %add = add i64 %phi, 1 + %mul = mul i64 %arg6, %add + %getelementptr17 = getelementptr i8, ptr %getelementptr, i64 %mul + call void @llvm.memcpy.p0.p0.i64(ptr %getelementptr16, ptr %getelementptr17, i64 0, i1 false) + %getelementptr18 = getelementptr i8, ptr %getelementptr17, i64 %arg7 + %getelementptr19 = getelementptr i8, ptr %getelementptr17, i64 %arg8 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr19, i64 0, i1 false) + %getelementptr20 = getelementptr i8, ptr %getelementptr17, i64 %arg9 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr20, i64 432, i1 false) + %getelementptr21 = 
getelementptr i8, ptr %getelementptr17, i64 %arg10 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %getelementptr21, i64 0, i1 false) + br i1 %arg, label %bb11, label %bb15 +} diff --git a/llvm/test/CodeGen/X86/apx/pushp-popp.ll b/llvm/test/CodeGen/X86/apx/pushp-popp.ll new file mode 100644 index 0000000000000..ad4306fccce66 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/pushp-popp.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx -frame-pointer=all | FileCheck %s --check-prefix=FRAME + +define void @csr2() nounwind { +; CHECK-LABEL: csr2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushp %rbp +; CHECK-NEXT: pushp %r15 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popp %r15 +; CHECK-NEXT: popp %rbp +; CHECK-NEXT: retq +; +; FRAME-LABEL: csr2: +; FRAME: # %bb.0: # %entry +; FRAME-NEXT: pushp %rbp +; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: pushp %r15 +; FRAME-NEXT: #APP +; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popp %r15 +; FRAME-NEXT: popp %rbp +; FRAME-NEXT: retq +entry: + tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"() + ret void +}