@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
140140 cl::desc (" enable use of redzone on AArch64" ),
141141 cl::init(false ), cl::Hidden);
142142
143+ static cl::opt<bool > // Switch read by restoreCalleeSavedRegisters when emitting CSR restores.
144+ ReverseCSRRestoreSeq (" reverse-csr-restore-seq" ,
145+ cl::desc (" reverse the CSR restore sequence" ),
146+ cl::init(false ), cl::Hidden); // Initialized to false; when true, restores are emitted over reverse(RegPairs).
147+
143148STATISTIC (NumRedZoneFunctions, " Number of functions using red zone" );
144149
145150// / This is the biggest offset to the stack pointer we can encode in aarch64
@@ -843,14 +848,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
843848 Subtarget.isCallingConvWin64 (MF.getFunction ().getCallingConv ());
844849 unsigned FixedObject = IsWin64 ? alignTo (AFI->getVarArgsGPRSize (), 16 ) : 0 ;
845850
851+ uint64_t AfterCSRPopSize = ArgumentPopSize;
846852 auto PrologueSaveSize = AFI->getCalleeSavedStackSize () + FixedObject;
847853 bool CombineSPBump = shouldCombineCSRLocalStackBump (MF, NumBytes);
848-
849- if (!CombineSPBump && PrologueSaveSize != 0 )
850- convertCalleeSaveRestoreToSPPrePostIncDec (
851- MBB, std::prev (MBB.getFirstTerminator ()), DL, TII, PrologueSaveSize);
854+ // Assume we can't combine the last pop with the sp restore.
855+
856+ if (!CombineSPBump && PrologueSaveSize != 0 ) {
857+ MachineBasicBlock::iterator Pop = std::prev (MBB.getFirstTerminator ());
858+ // Converting the last ldp to a post-index ldp is valid only if the last
859+ // ldp's offset is 0.
860+ const MachineOperand &OffsetOp = Pop->getOperand (Pop->getNumOperands () - 1 );
861+ // If the offset is 0, convert it to a post-index ldp.
862+ if (OffsetOp.getImm () == 0 ) {
863+ convertCalleeSaveRestoreToSPPrePostIncDec (MBB, Pop, DL, TII,
864+ PrologueSaveSize);
865+ } else {
866+ // If not, make sure to emit an add after the last ldp.
867+ // We're doing this by transferring the size to be restored from the
868+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
869+ // pops.
870+ AfterCSRPopSize += PrologueSaveSize;
871+ }
872+ }
852873
853874 // Move past the restores of the callee-saved registers.
875+ // If we plan on combining the sp bump of the local stack size and the callee
876+ // save stack size, we might need to adjust the CSR save and restore offsets.
854877 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator ();
855878 MachineBasicBlock::iterator Begin = MBB.begin ();
856879 while (LastPopI != Begin) {
@@ -865,7 +888,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
865888 // If there is a single SP update, insert it before the ret and we're done.
866889 if (CombineSPBump) {
867890 emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
868- NumBytes + ArgumentPopSize , TII,
891+ NumBytes + AfterCSRPopSize , TII,
869892 MachineInstr::FrameDestroy);
870893 return ;
871894 }
@@ -877,18 +900,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
877900 bool RedZone = canUseRedZone (MF);
878901 // If this was a redzone leaf function, we don't need to restore the
879902 // stack pointer (but we may need to pop stack args for fastcc).
880- if (RedZone && ArgumentPopSize == 0 )
903+ if (RedZone && AfterCSRPopSize == 0 )
881904 return ;
882905
883906 bool NoCalleeSaveRestore = PrologueSaveSize == 0 ;
884907 int StackRestoreBytes = RedZone ? 0 : NumBytes;
885908 if (NoCalleeSaveRestore)
886- StackRestoreBytes += ArgumentPopSize ;
909+ StackRestoreBytes += AfterCSRPopSize ;
887910 emitFrameOffset (MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
888911 StackRestoreBytes, TII, MachineInstr::FrameDestroy);
889912 // If we were able to combine the local stack pop with the argument pop,
890913 // then we're done.
891- if (NoCalleeSaveRestore || ArgumentPopSize == 0 )
914+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0 )
892915 return ;
893916 NumBytes = 0 ;
894917 }
@@ -908,9 +931,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
908931 // This must be placed after the callee-save restore code because that code
909932 // assumes the SP is at the same location as it was after the callee-save save
910933 // code in the prologue.
911- if (ArgumentPopSize)
934+ if (AfterCSRPopSize) {
935+ // Sometimes (when we restore in the same order as we save), we can end up
936+ // with code like this:
937+ //
938+ // ldp x26, x25, [sp]
939+ // ldp x24, x23, [sp, #16]
940+ // ldp x22, x21, [sp, #32]
941+ // ldp x20, x19, [sp, #48]
942+ // add sp, sp, #64
943+ //
944+ // In this case, it is always better to put the first ldp at the end, so
945+ // that the load-store optimizer can run and merge the ldp and the add into
946+ // a post-index ldp.
947+ // If we managed to grab the first pop instruction, move it to the end.
948+ if (LastPopI != Begin)
949+ MBB.splice (MBB.getFirstTerminator (), &MBB, LastPopI);
950+ // We should end up with something like this now:
951+ //
952+ // ldp x24, x23, [sp, #16]
953+ // ldp x22, x21, [sp, #32]
954+ // ldp x20, x19, [sp, #48]
955+ // ldp x26, x25, [sp]
956+ // add sp, sp, #64
957+ //
958+ // and the load-store optimizer can merge the last two instructions into:
959+ //
960+ // ldp x26, x25, [sp], #64
961+ //
912962 emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
913- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
963+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
964+ }
914965}
915966
916967// / getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1179,9 +1230,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
11791230
11801231 computeCalleeSaveRegisterPairs (MF, CSI, TRI, RegPairs);
11811232
1182- for (auto RPII = RegPairs.begin (), RPIE = RegPairs.end (); RPII != RPIE;
1183- ++RPII) {
1184- RegPairInfo RPI = *RPII;
1233+ auto EmitMI = [&](const RegPairInfo &RPI) {
11851234 unsigned Reg1 = RPI.Reg1 ;
11861235 unsigned Reg2 = RPI.Reg2 ;
11871236
@@ -1220,7 +1269,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
12201269 MIB.addMemOperand (MF.getMachineMemOperand (
12211270 MachinePointerInfo::getFixedStack (MF, RPI.FrameIdx ),
12221271 MachineMemOperand::MOLoad, 8 , 8 ));
1223- }
1272+ };
1273+
1274+ if (ReverseCSRRestoreSeq)
1275+ for (const RegPairInfo &RPI : reverse (RegPairs))
1276+ EmitMI (RPI);
1277+ else
1278+ for (const RegPairInfo &RPI : RegPairs)
1279+ EmitMI (RPI);
12241280 return true ;
12251281}
12261282
0 commit comments