Skip to content

Commit 27a549f

Browse files
committed
[AArch64][SME] Simplify initialization of TPIDR2 block
This patch updates the definition of `AArch64ISD::INIT_TPIDR2OBJ` to take the number of save slices (which is currently always all ZA slices). Using this, we can initialize the TPIDR2 block with a single STP of the save buffer pointer and the number of save slices. The reserved bytes (10-15) are implicitly zeroed because the result of RDSVL always fits in 16 bits, so the upper 48 bits of the stored 64-bit value are zero. Using an STP is also possible for big-endian targets with an additional left shift (by 48 bits, which moves "num_za_save_slices" into the top two bytes of the register). Note: We used to write the number of save slices to the TPIDR2 block before every call with a lazy save; however, based on 6.6.9 "Changes to the TPIDR2 block" in the aapcs64 (https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#changes-to-the-tpidr2-block), it seems we can rely on callees preserving the contents of the TPIDR2 block.
1 parent 1a08aa2 commit 27a549f

File tree

10 files changed

+150
-142
lines changed

10 files changed

+150
-142
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3231,20 +3231,24 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
32313231
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
32323232
if (TPIDR2.Uses > 0) {
32333233
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3234-
// Store the buffer pointer to the TPIDR2 stack object.
3235-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3234+
unsigned TPIDInitSaveSlicesReg = MI.getOperand(1).getReg();
3235+
if (!Subtarget->isLittleEndian()) {
3236+
unsigned TmpReg =
3237+
MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
3238+
// For big-endian targets move "num_za_save_slices" to the top two bytes.
3239+
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::UBFMXri), TmpReg)
3240+
.addReg(TPIDInitSaveSlicesReg)
3241+
.addImm(16)
3242+
.addImm(15);
3243+
TPIDInitSaveSlicesReg = TmpReg;
3244+
}
3245+
// Store buffer pointer and num_za_save_slices.
3246+
// Bytes 10-15 are implicitly zeroed.
3247+
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
32363248
.addReg(MI.getOperand(0).getReg())
3249+
.addReg(TPIDInitSaveSlicesReg)
32373250
.addFrameIndex(TPIDR2.FrameIndex)
32383251
.addImm(0);
3239-
// Set the reserved bytes (10-15) to zero
3240-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3241-
.addReg(AArch64::WZR)
3242-
.addFrameIndex(TPIDR2.FrameIndex)
3243-
.addImm(5);
3244-
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3245-
.addReg(AArch64::WZR)
3246-
.addFrameIndex(TPIDR2.FrameIndex)
3247-
.addImm(3);
32483252
} else
32493253
MFI.RemoveStackObject(TPIDR2.FrameIndex);
32503254

@@ -8348,9 +8352,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
83488352
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
83498353
MFI.CreateVariableSizedObject(Align(16), nullptr);
83508354
}
8355+
SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8356+
DAG.getConstant(1, DL, MVT::i32));
83518357
Chain = DAG.getNode(
83528358
AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8353-
{/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8359+
{/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8360+
/*Num save slices*/ NumZaSaveSlices});
83548361
} else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
83558362
// Call __arm_sme_state_size().
83568363
SDValue BufferSize =
@@ -9131,19 +9138,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
91319138
bool RequiresLazySave = CallAttrs.requiresLazySave();
91329139
bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
91339140
if (RequiresLazySave) {
9134-
const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9135-
MachinePointerInfo MPI =
9136-
MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
9141+
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
91379142
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
91389143
TPIDR2.FrameIndex,
91399144
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9140-
SDValue NumZaSaveSlicesAddr =
9141-
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9142-
DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9143-
SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9144-
DAG.getConstant(1, DL, MVT::i32));
9145-
Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9146-
MPI, MVT::i16);
91479145
Chain = DAG.getNode(
91489146
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
91499147
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
5454
def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
5555
(AllocateZABuffer $size)>;
5656

57-
def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
58-
[SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
57+
def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2,
58+
[SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>;
5959
let usesCustomInserter = 1 in {
60-
def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
60+
def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {}
6161
}
6262

6363
// Nodes to allocate a save buffer for SME.

llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,7 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
268268
; CHECK-COMMON-NEXT: mov x9, sp
269269
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
270270
; CHECK-COMMON-NEXT: mov sp, x9
271-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
272-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
273-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
274-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
271+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
275272
; CHECK-COMMON-NEXT: sub x8, x29, #16
276273
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
277274
; CHECK-COMMON-NEXT: bl normal_callee
@@ -310,12 +307,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
310307
; CHECK-COMMON-NEXT: mov x9, sp
311308
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
312309
; CHECK-COMMON-NEXT: mov sp, x9
313-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
314-
; CHECK-COMMON-NEXT: sub x9, x29, #16
315-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
316-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
317-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
318-
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
310+
; CHECK-COMMON-NEXT: sub x10, x29, #16
311+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
312+
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
319313
; CHECK-COMMON-NEXT: bl __addtf3
320314
; CHECK-COMMON-NEXT: smstart za
321315
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
@@ -375,12 +369,9 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
375369
; CHECK-COMMON-NEXT: mov x9, sp
376370
; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
377371
; CHECK-COMMON-NEXT: mov sp, x9
378-
; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
379-
; CHECK-COMMON-NEXT: sub x9, x29, #16
380-
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
381-
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
382-
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
383-
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
372+
; CHECK-COMMON-NEXT: sub x10, x29, #16
373+
; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
374+
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
384375
; CHECK-COMMON-NEXT: bl fmod
385376
; CHECK-COMMON-NEXT: smstart za
386377
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0

llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll

Lines changed: 21 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,9 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
1616
; CHECK-NEXT: mov x9, sp
1717
; CHECK-NEXT: msub x9, x8, x8, x9
1818
; CHECK-NEXT: mov sp, x9
19-
; CHECK-NEXT: stur x9, [x29, #-16]
20-
; CHECK-NEXT: sub x9, x29, #16
21-
; CHECK-NEXT: sturh wzr, [x29, #-6]
22-
; CHECK-NEXT: stur wzr, [x29, #-4]
23-
; CHECK-NEXT: sturh w8, [x29, #-8]
24-
; CHECK-NEXT: msr TPIDR2_EL0, x9
19+
; CHECK-NEXT: sub x10, x29, #16
20+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
21+
; CHECK-NEXT: msr TPIDR2_EL0, x10
2522
; CHECK-NEXT: bl private_za_callee
2623
; CHECK-NEXT: smstart za
2724
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -43,21 +40,17 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
4340
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
4441
; CHECK-LABEL: test_lazy_save_2_callees:
4542
; CHECK: // %bb.0:
46-
; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
47-
; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
43+
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
44+
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
4845
; CHECK-NEXT: mov x29, sp
49-
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
5046
; CHECK-NEXT: sub sp, sp, #16
51-
; CHECK-NEXT: rdsvl x20, #1
52-
; CHECK-NEXT: mov x8, sp
53-
; CHECK-NEXT: msub x8, x20, x20, x8
54-
; CHECK-NEXT: mov sp, x8
55-
; CHECK-NEXT: sub x21, x29, #16
56-
; CHECK-NEXT: stur x8, [x29, #-16]
57-
; CHECK-NEXT: sturh wzr, [x29, #-6]
58-
; CHECK-NEXT: stur wzr, [x29, #-4]
59-
; CHECK-NEXT: sturh w20, [x29, #-8]
60-
; CHECK-NEXT: msr TPIDR2_EL0, x21
47+
; CHECK-NEXT: rdsvl x8, #1
48+
; CHECK-NEXT: mov x9, sp
49+
; CHECK-NEXT: msub x9, x8, x8, x9
50+
; CHECK-NEXT: mov sp, x9
51+
; CHECK-NEXT: sub x20, x29, #16
52+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
53+
; CHECK-NEXT: msr TPIDR2_EL0, x20
6154
; CHECK-NEXT: bl private_za_callee
6255
; CHECK-NEXT: smstart za
6356
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -67,8 +60,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
6760
; CHECK-NEXT: bl __arm_tpidr2_restore
6861
; CHECK-NEXT: .LBB1_2:
6962
; CHECK-NEXT: msr TPIDR2_EL0, xzr
70-
; CHECK-NEXT: sturh w20, [x29, #-8]
71-
; CHECK-NEXT: msr TPIDR2_EL0, x21
63+
; CHECK-NEXT: msr TPIDR2_EL0, x20
7264
; CHECK-NEXT: bl private_za_callee
7365
; CHECK-NEXT: smstart za
7466
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -79,9 +71,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
7971
; CHECK-NEXT: .LBB1_4:
8072
; CHECK-NEXT: msr TPIDR2_EL0, xzr
8173
; CHECK-NEXT: mov sp, x29
82-
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
83-
; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
84-
; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
74+
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
75+
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
8576
; CHECK-NEXT: ret
8677
call void @private_za_callee()
8778
call void @private_za_callee()
@@ -100,12 +91,9 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
10091
; CHECK-NEXT: mov x9, sp
10192
; CHECK-NEXT: msub x9, x8, x8, x9
10293
; CHECK-NEXT: mov sp, x9
103-
; CHECK-NEXT: stur x9, [x29, #-16]
104-
; CHECK-NEXT: sub x9, x29, #16
105-
; CHECK-NEXT: sturh wzr, [x29, #-6]
106-
; CHECK-NEXT: stur wzr, [x29, #-4]
107-
; CHECK-NEXT: sturh w8, [x29, #-8]
108-
; CHECK-NEXT: msr TPIDR2_EL0, x9
94+
; CHECK-NEXT: sub x10, x29, #16
95+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
96+
; CHECK-NEXT: msr TPIDR2_EL0, x10
10997
; CHECK-NEXT: bl cosf
11098
; CHECK-NEXT: smstart za
11199
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -141,12 +129,9 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
141129
; CHECK-NEXT: mov x9, sp
142130
; CHECK-NEXT: msub x9, x8, x8, x9
143131
; CHECK-NEXT: mov sp, x9
144-
; CHECK-NEXT: stur x9, [x29, #-80]
145-
; CHECK-NEXT: sub x9, x29, #80
146-
; CHECK-NEXT: sturh wzr, [x29, #-70]
147-
; CHECK-NEXT: stur wzr, [x29, #-68]
148-
; CHECK-NEXT: sturh w8, [x29, #-72]
149-
; CHECK-NEXT: msr TPIDR2_EL0, x9
132+
; CHECK-NEXT: sub x10, x29, #80
133+
; CHECK-NEXT: stp x9, x8, [x29, #-80]
134+
; CHECK-NEXT: msr TPIDR2_EL0, x10
150135
; CHECK-NEXT: bl __arm_sme_state
151136
; CHECK-NEXT: and x20, x0, #0x1
152137
; CHECK-NEXT: tbz w20, #0, .LBB3_2

llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,9 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
1414
; CHECK-NEXT: mov x9, sp
1515
; CHECK-NEXT: msub x9, x8, x8, x9
1616
; CHECK-NEXT: mov sp, x9
17-
; CHECK-NEXT: stur x9, [x29, #-16]
18-
; CHECK-NEXT: sub x9, x29, #16
19-
; CHECK-NEXT: sturh wzr, [x29, #-6]
20-
; CHECK-NEXT: stur wzr, [x29, #-4]
21-
; CHECK-NEXT: sturh w8, [x29, #-8]
22-
; CHECK-NEXT: msr TPIDR2_EL0, x9
17+
; CHECK-NEXT: sub x10, x29, #16
18+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
19+
; CHECK-NEXT: msr TPIDR2_EL0, x10
2320
; CHECK-NEXT: bl private_za_callee
2421
; CHECK-NEXT: smstart za
2522
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -47,12 +44,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
4744
; CHECK-NEXT: mov x9, sp
4845
; CHECK-NEXT: msub x9, x8, x8, x9
4946
; CHECK-NEXT: mov sp, x9
50-
; CHECK-NEXT: stur x9, [x29, #-16]
51-
; CHECK-NEXT: sub x9, x29, #16
52-
; CHECK-NEXT: sturh wzr, [x29, #-6]
53-
; CHECK-NEXT: stur wzr, [x29, #-4]
54-
; CHECK-NEXT: sturh w8, [x29, #-8]
55-
; CHECK-NEXT: msr TPIDR2_EL0, x9
47+
; CHECK-NEXT: sub x10, x29, #16
48+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
49+
; CHECK-NEXT: msr TPIDR2_EL0, x10
5650
; CHECK-NEXT: bl __addtf3
5751
; CHECK-NEXT: smstart za
5852
; CHECK-NEXT: mrs x8, TPIDR2_EL0
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64_be -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme < %s | FileCheck %s --check-prefix=CHECK-BE
4+
5+
declare void @private_za_callee()
6+
declare float @llvm.cos.f32(float)
7+
8+
; Test TPIDR2_EL0 is initialized correctly for AArch64 big-endian.
9+
define void @test_tpidr2_init() nounwind "aarch64_inout_za" {
10+
; CHECK-LABEL: test_tpidr2_init:
11+
; CHECK: // %bb.0:
12+
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
13+
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
14+
; CHECK-NEXT: mov x29, sp
15+
; CHECK-NEXT: sub sp, sp, #16
16+
; CHECK-NEXT: rdsvl x8, #1
17+
; CHECK-NEXT: mov x9, sp
18+
; CHECK-NEXT: msub x9, x8, x8, x9
19+
; CHECK-NEXT: mov sp, x9
20+
; CHECK-NEXT: sub x10, x29, #16
21+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
22+
; CHECK-NEXT: msr TPIDR2_EL0, x10
23+
; CHECK-NEXT: bl private_za_callee
24+
; CHECK-NEXT: smstart za
25+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
26+
; CHECK-NEXT: sub x0, x29, #16
27+
; CHECK-NEXT: cbnz x8, .LBB0_2
28+
; CHECK-NEXT: // %bb.1:
29+
; CHECK-NEXT: bl __arm_tpidr2_restore
30+
; CHECK-NEXT: .LBB0_2:
31+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
32+
; CHECK-NEXT: mov sp, x29
33+
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
34+
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
35+
; CHECK-NEXT: ret
36+
;
37+
; CHECK-BE-LABEL: test_tpidr2_init:
38+
; CHECK-BE: // %bb.0:
39+
; CHECK-BE-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
40+
; CHECK-BE-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
41+
; CHECK-BE-NEXT: mov x29, sp
42+
; CHECK-BE-NEXT: sub sp, sp, #16
43+
; CHECK-BE-NEXT: rdsvl x8, #1
44+
; CHECK-BE-NEXT: mov x9, sp
45+
; CHECK-BE-NEXT: msub x9, x8, x8, x9
46+
; CHECK-BE-NEXT: mov sp, x9
47+
; CHECK-BE-NEXT: lsl x8, x8, #48
48+
; CHECK-BE-NEXT: sub x10, x29, #16
49+
; CHECK-BE-NEXT: stp x9, x8, [x29, #-16]
50+
; CHECK-BE-NEXT: msr TPIDR2_EL0, x10
51+
; CHECK-BE-NEXT: bl private_za_callee
52+
; CHECK-BE-NEXT: smstart za
53+
; CHECK-BE-NEXT: mrs x8, TPIDR2_EL0
54+
; CHECK-BE-NEXT: sub x0, x29, #16
55+
; CHECK-BE-NEXT: cbnz x8, .LBB0_2
56+
; CHECK-BE-NEXT: // %bb.1:
57+
; CHECK-BE-NEXT: bl __arm_tpidr2_restore
58+
; CHECK-BE-NEXT: .LBB0_2:
59+
; CHECK-BE-NEXT: msr TPIDR2_EL0, xzr
60+
; CHECK-BE-NEXT: mov sp, x29
61+
; CHECK-BE-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
62+
; CHECK-BE-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
63+
; CHECK-BE-NEXT: ret
64+
call void @private_za_callee()
65+
ret void
66+
}

llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,18 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
2121
; CHECK-NEXT: .cfi_offset w29, -16
2222
; CHECK-NEXT: rdsvl x8, #1
2323
; CHECK-NEXT: mov x9, sp
24-
; CHECK-NEXT: msub x8, x8, x8, x9
25-
; CHECK-NEXT: mov sp, x8
26-
; CHECK-NEXT: stur x8, [x29, #-16]
27-
; CHECK-NEXT: sturh wzr, [x29, #-6]
28-
; CHECK-NEXT: stur wzr, [x29, #-4]
24+
; CHECK-NEXT: msub x9, x8, x8, x9
25+
; CHECK-NEXT: mov sp, x9
26+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
2927
; CHECK-NEXT: cbz w0, .LBB1_2
3028
; CHECK-NEXT: // %bb.1: // %use_b
3129
; CHECK-NEXT: fmov s1, #4.00000000
3230
; CHECK-NEXT: fadd s0, s0, s1
3331
; CHECK-NEXT: b .LBB1_5
3432
; CHECK-NEXT: .LBB1_2: // %use_c
3533
; CHECK-NEXT: fmov s0, s1
36-
; CHECK-NEXT: rdsvl x8, #1
37-
; CHECK-NEXT: sub x9, x29, #16
38-
; CHECK-NEXT: sturh w8, [x29, #-8]
39-
; CHECK-NEXT: msr TPIDR2_EL0, x9
34+
; CHECK-NEXT: sub x8, x29, #16
35+
; CHECK-NEXT: msr TPIDR2_EL0, x8
4036
; CHECK-NEXT: bl cosf
4137
; CHECK-NEXT: smstart za
4238
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -77,31 +73,27 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
7773
; CHECK-NEXT: .cfi_offset w29, -16
7874
; CHECK-NEXT: rdsvl x8, #1
7975
; CHECK-NEXT: mov x9, sp
80-
; CHECK-NEXT: msub x8, x8, x8, x9
76+
; CHECK-NEXT: msub x9, x8, x8, x9
8177
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
8278
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
83-
; CHECK-NEXT: cmp sp, x8
79+
; CHECK-NEXT: cmp sp, x9
8480
; CHECK-NEXT: b.le .LBB2_3
8581
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
8682
; CHECK-NEXT: str xzr, [sp]
8783
; CHECK-NEXT: b .LBB2_1
8884
; CHECK-NEXT: .LBB2_3:
89-
; CHECK-NEXT: mov sp, x8
85+
; CHECK-NEXT: mov sp, x9
9086
; CHECK-NEXT: ldr xzr, [sp]
91-
; CHECK-NEXT: stur x8, [x29, #-16]
92-
; CHECK-NEXT: sturh wzr, [x29, #-6]
93-
; CHECK-NEXT: stur wzr, [x29, #-4]
87+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
9488
; CHECK-NEXT: cbz w0, .LBB2_5
9589
; CHECK-NEXT: // %bb.4: // %use_b
9690
; CHECK-NEXT: fmov s1, #4.00000000
9791
; CHECK-NEXT: fadd s0, s0, s1
9892
; CHECK-NEXT: b .LBB2_8
9993
; CHECK-NEXT: .LBB2_5: // %use_c
10094
; CHECK-NEXT: fmov s0, s1
101-
; CHECK-NEXT: rdsvl x8, #1
102-
; CHECK-NEXT: sub x9, x29, #16
103-
; CHECK-NEXT: sturh w8, [x29, #-8]
104-
; CHECK-NEXT: msr TPIDR2_EL0, x9
95+
; CHECK-NEXT: sub x8, x29, #16
96+
; CHECK-NEXT: msr TPIDR2_EL0, x8
10597
; CHECK-NEXT: bl cosf
10698
; CHECK-NEXT: smstart za
10799
; CHECK-NEXT: mrs x8, TPIDR2_EL0

0 commit comments

Comments
 (0)