From 7e706b4bc2c0534de93762d73aeaf46ce9fc9d5e Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 12 Sep 2024 13:52:24 +0000 Subject: [PATCH 1/7] Precommit test for sincos stack slots --- .../CodeGen/AArch64/sincos-stack-slots.ll | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sincos-stack-slots.ll diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll new file mode 100644 index 0000000000000..9c362ba117fef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -0,0 +1,152 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +define { float, float } @sincos_f32_value_return(float %x) { +; CHECK-LABEL: sincos_f32_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + %ret_0 = insertvalue { float, float } poison, float %sin, 0 + %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1 + ret { float, float } %ret_1 +} + +define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: sincos_f32_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: str s0, [x20] +; CHECK-NEXT: str s1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +define float @sincos_f32_mixed_return(float %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f32_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s0, s1, [sp, #8] +; CHECK-NEXT: str s1, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + ret float %cos +} + +define { double, double } @sincos_f64_value_return(double %x) { +; CHECK-LABEL: sincos_f64_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos 
+; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + %ret_0 = insertvalue { double, double } poison, double %sin, 0 + %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1 + ret { double, double } %ret_1 +} + +define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: sincos_f64_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [x20] +; CHECK-NEXT: str d1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 4 + store double %cos, ptr %out_cos, align 4 + ret void +} + +define double @sincos_f64_mixed_return(double %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f64_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: add x0, sp, #8 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: bl sincos +; 
CHECK-NEXT: ldp d0, d1, [sp] +; CHECK-NEXT: str d1, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 4 + ret double %cos +} From 375204759512e87b77bfa39dfb2066e3059ff05e Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 12 Sep 2024 13:55:25 +0000 Subject: [PATCH 2/7] [SDAG] Avoid creating redundant stack slots when lowering FSINCOS When lowering `FSINCOS` to a library call (that takes output pointers) we can avoid creating new stack allocations if the results of the `FSINCOS` are being stored. Instead, we can take the destination pointers from the stores and pass those to the library call. --- .../include/llvm/CodeGen/RuntimeLibcallUtil.h | 4 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 99 +++++++++---------- llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 + .../CodeGen/AArch64/sincos-stack-slots.ll | 73 ++++---------- 4 files changed, 75 insertions(+), 106 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index 7a13164589392..045ec7d365311 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT); /// UNKNOWN_LIBCALL if there is none. Libcall getFREXP(EVT RetVT); +/// getFSINCOS - Return the SINCOS_* value for the given type, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFSINCOS(EVT RetVT); + /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or /// UNKNOWN_LIBCALL if there is none. 
Libcall getSYNC(unsigned Opc, MVT VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index f5fbc01cd95e9..7b0dc63c473d2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, /// Return true if sincos libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; - } + RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy); return TLI.getLibcallName(LC) != nullptr; } @@ -2355,68 +2347,73 @@ static bool useSinCos(SDNode *Node) { } /// Issue libcalls to sincos to compute sin / cos pairs. -void -SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, - SmallVectorImpl &Results) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; - } - - // The input chain to this libcall is the entry node of the function. - // Legalizing the call will automatically add the previous call to the - // dependence. 
- SDValue InChain = DAG.getEntryNode(); - +void SelectionDAGLegalize::ExpandSinCosLibCall( + SDNode *Node, SmallVectorImpl &Results) { EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; + TargetLowering::ArgListEntry Entry{}; + + // Find users of the node that store the results. The destination pointers + // can be used instead of creating stack allocations. + StoreSDNode *SinST = nullptr; + StoreSDNode *CosST = nullptr; + for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end(); + UI != UE; ++UI) { + SDUse &Use = UI.getUse(); + SDNode *User = Use.getUser(); + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast(User); + if (Use.getResNo() == 0) + SinST = ST; + if (Use.getResNo() == 1) + CosST = ST; + } // Pass the argument. Entry.Node = Node->getOperand(0); Entry.Ty = RetTy; - Entry.IsSExt = false; - Entry.IsZExt = false; Args.push_back(Entry); + auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) { + if (MaybeStore) + return std::make_pair(MaybeStore->getBasePtr(), + MaybeStore->getPointerInfo()); + SDValue StackSlot = DAG.CreateStackTemporary(RetVT); + auto PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast(StackSlot)->getIndex()); + return std::make_pair(StackSlot, PtrInfo); + }; + // Pass the return address of sin. - SDValue SinPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = SinPtr; + auto SinPtr = GetOrCreateOutPointer(SinST); + Entry.Node = SinPtr.first; Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; Args.push_back(Entry); // Also pass the return address of the cos. 
- SDValue CosPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = CosPtr; + auto CosPtr = GetOrCreateOutPointer(CosST); + Entry.Node = CosPtr.first; Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; Args.push_back(Entry); - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - - SDLoc dl(Node); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( - TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args)); + RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT); + auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false); - std::pair CallInfo = TLI.LowerCallTo(CLI); + // Replace explict stores with the library call. + for (StoreSDNode *ST : {SinST, CosST}) { + if (ST) + DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain); + } - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo())); - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo())); + SDLoc DL(Node); + for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) { + SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo); + Results.push_back(LoadExp); + } } SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9fdde45455917..1f49d60c97059 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -400,6 +400,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) { FREXP_PPCF128); } +RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) { + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, + SINCOS_PPCF128); +} + RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order, uint64_t MemSize) { diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll 
b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll index 9c362ba117fef..afd054a83a501 100644 --- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -24,22 +24,11 @@ entry: define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) { ; CHECK-LABEL: sincos_f32_ptr_return: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: add x0, sp, #12 -; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldp s1, s0, [sp, #8] -; CHECK-NEXT: str s0, [x20] -; CHECK-NEXT: str s1, [x19] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %sin = tail call float @llvm.sin.f32(float %x) @@ -52,19 +41,13 @@ entry: define float @sincos_f32_mixed_return(float %x, ptr %out_sin) { ; CHECK-LABEL: sincos_f32_mixed_return: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #12 -; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: add x1, sp, #12 ; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldp s0, s1, [sp, #8] -; CHECK-NEXT: str s1, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %sin = tail call float @llvm.sin.f32(float %x) @@ -99,25 +82,11 @@ entry: define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) { ; CHECK-LABEL: sincos_f64_ptr_return: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: add x0, sp, #24 -; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldr d0, [sp, #24] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: str d0, [x20] -; CHECK-NEXT: str d1, [x19] -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %sin = tail call double @llvm.sin.f64(double %x) @@ -130,19 +99,13 @@ entry: define double @sincos_f64_mixed_return(double %x, ptr %out_sin) { ; CHECK-LABEL: sincos_f64_mixed_return: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #8 -; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: add x1, sp, #8 ; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldp d0, d1, [sp] -; CHECK-NEXT: str d1, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %sin = tail call double @llvm.sin.f64(double %x) From 988273fd5a48e153344f1c090655abec33fd0f8f Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 12 Sep 2024 18:16:46 +0000 Subject: [PATCH 3/7] Fixups --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 49 ++++++--- .../CodeGen/AArch64/sincos-stack-slots.ll | 99 ++++++++++++++++++- 2 files changed, 131 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7b0dc63c473d2..aa91d064db48e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2351,9 +2351,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( SDNode *Node, SmallVectorImpl &Results) { EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry{}; + RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT); // Find users of the node that store the results. The destination pointers // can be used instead of creating stack allocations. @@ -2366,17 +2364,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( if (!ISD::isNormalStore(User)) continue; auto *ST = cast(User); + if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 || + ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy)) + continue; if (Use.getResNo() == 0) SinST = ST; if (Use.getResNo() == 1) CosST = ST; } - // Pass the argument. - Entry.Node = Node->getOperand(0); - Entry.Ty = RetTy; - Args.push_back(Entry); - auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) { if (MaybeStore) return std::make_pair(MaybeStore->getBasePtr(), @@ -2388,6 +2384,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( return std::make_pair(StackSlot, PtrInfo); }; + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry{}; + + // Pass the argument. + Entry.Node = Node->getOperand(0); + Entry.Ty = RetTy; + Args.push_back(Entry); + // Pass the return address of sin. auto SinPtr = GetOrCreateOutPointer(SinST); Entry.Node = SinPtr.first; @@ -2400,18 +2404,35 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( Entry.Ty = PointerType::getUnqual(RetTy->getContext()); Args.push_back(Entry); - RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT); - auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false); - - // Replace explict stores with the library call. + // Combine any input chains from the stores. 
+ SmallVector InChains{}; for (StoreSDNode *ST : {SinST, CosST}) { if (ST) - DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain); + InChains.push_back(ST->getChain()); } + if (InChains.empty()) + InChains.push_back(DAG.getEntryNode()); SDLoc DL(Node); + SDValue InChain = DAG.getTokenFactor(DL, InChains); + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( + TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, + std::move(Args)); + + auto [Call, OutChain] = TLI.LowerCallTo(CLI); + + // Replace the stores with the library call. + for (StoreSDNode *ST : {SinST, CosST}) { + if (!ST) + continue; + DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + } + for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) { - SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo); + SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo); Results.push_back(LoadExp); } } diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll index afd054a83a501..c32ac58a6a851 100644 --- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s +; This file tests eliding stack slots when lowering the FSINCOS ISD node. 
+ define { float, float } @sincos_f32_value_return(float %x) { ; CHECK-LABEL: sincos_f32_value_return: ; CHECK: // %bb.0: // %entry @@ -91,8 +93,8 @@ define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) { entry: %sin = tail call double @llvm.sin.f64(double %x) %cos = tail call double @llvm.cos.f64(double %x) - store double %sin, ptr %out_sin, align 4 - store double %cos, ptr %out_cos, align 4 + store double %sin, ptr %out_sin, align 8 + store double %cos, ptr %out_cos, align 8 ret void } @@ -110,6 +112,97 @@ define double @sincos_f64_mixed_return(double %x, ptr %out_sin) { entry: %sin = tail call double @llvm.sin.f64(double %x) %cos = tail call double @llvm.cos.f64(double %x) - store double %sin, ptr %out_sin, align 4 + store double %sin, ptr %out_sin, align 8 ret double %cos } + +; Negative test. We can't fold volatile stores into the library call. +define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: negative_fold_sincos_volatile_store: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: str s0, [x20] +; CHECK-NEXT: str s1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store volatile float %sin, ptr %out_sin, align 4 + store volatile float %cos, ptr %out_cos, align 4 + ret void +} + +; Negative test. We can't fold atomic stores into the library call. 
+define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: negative_fold_sincos_atomic_store: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr w8, [sp, #12] +; CHECK-NEXT: str w8, [x20] +; CHECK-NEXT: ldr w8, [sp, #8] +; CHECK-NEXT: str w8, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store atomic float %sin, ptr %out_sin unordered, align 4 + store atomic float %cos, ptr %out_cos unordered, align 4 + ret void +} + +; Negative test. We can't fold misaligned stores into the library call. 
+define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: negative_sincos_bad_alignment: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [x20] +; CHECK-NEXT: str d1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 4 + store double %cos, ptr %out_cos, align 4 + ret void +} From 529a94e350f77e9270cd33922dba77f9b421e226 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 12 Sep 2024 20:43:05 +0000 Subject: [PATCH 4/7] Avoid cyclic input chains --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 83 ++++++++----------- .../CodeGen/AArch64/sincos-stack-slots.ll | 6 +- 2 files changed, 39 insertions(+), 50 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index aa91d064db48e..583859deb7ee7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2355,8 +2355,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( // Find users of the node that store the results. The destination pointers // can be used instead of creating stack allocations. 
- StoreSDNode *SinST = nullptr; - StoreSDNode *CosST = nullptr; + std::array ResultStores = {nullptr}; for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); @@ -2367,22 +2366,18 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 || ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy)) continue; - if (Use.getResNo() == 0) - SinST = ST; - if (Use.getResNo() == 1) - CosST = ST; - } - - auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) { - if (MaybeStore) - return std::make_pair(MaybeStore->getBasePtr(), - MaybeStore->getPointerInfo()); - SDValue StackSlot = DAG.CreateStackTemporary(RetVT); - auto PtrInfo = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - cast(StackSlot)->getIndex()); - return std::make_pair(StackSlot, PtrInfo); - }; + ResultStores[Use.getResNo()] = ST; + } + + // Collect input chains (and avoid chains referring to one of the stores). + SmallVector InChains; + for (auto [ResNum, ST] : llvm::enumerate(ResultStores)) { + unsigned OtherResNum = ResNum == 0 ? 1 : 0; + if (ST && ST->getChain().getNode() != ResultStores[OtherResNum]) + InChains.push_back(ST->getChain()); + } + if (InChains.empty()) + InChains.push_back(DAG.getEntryNode()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry{}; @@ -2392,28 +2387,19 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( Entry.Ty = RetTy; Args.push_back(Entry); - // Pass the return address of sin. - auto SinPtr = GetOrCreateOutPointer(SinST); - Entry.Node = SinPtr.first; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Args.push_back(Entry); - - // Also pass the return address of the cos. - auto CosPtr = GetOrCreateOutPointer(CosST); - Entry.Node = CosPtr.first; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Args.push_back(Entry); - - // Combine any input chains from the stores. 
- SmallVector InChains{}; - for (StoreSDNode *ST : {SinST, CosST}) { - if (ST) - InChains.push_back(ST->getChain()); + // Pass the output pointers for sin and cos. + SmallVector ResultPtrs{}; + for (StoreSDNode *ST : ResultStores) { + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(RetVT); + Entry.Node = ResultPtr; + Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + Args.push_back(Entry); + ResultPtrs.push_back(ResultPtr); } - if (InChains.empty()) - InChains.push_back(DAG.getEntryNode()); SDLoc DL(Node); + + // Combine any input chains from the stores. SDValue InChain = DAG.getTokenFactor(DL, InChains); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); @@ -2424,16 +2410,19 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( auto [Call, OutChain] = TLI.LowerCallTo(CLI); - // Replace the stores with the library call. - for (StoreSDNode *ST : {SinST, CosST}) { - if (!ST) - continue; - DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); - } - - for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) { - SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo); - Results.push_back(LoadExp); + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + MachinePointerInfo PtrInfo; + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast(ResultPtr)->getIndex()); + } + SDValue LoadResult = DAG.getLoad(RetVT, DL, OutChain, ResultPtr, PtrInfo); + Results.push_back(LoadResult); } } diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll index c32ac58a6a851..697e9c3444f86 100644 --- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -118,7 +118,7 @@ entry: ; Negative test. We can't fold volatile stores into the library call. define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) { -; CHECK-LABEL: negative_fold_sincos_volatile_store: +; CHECK-LABEL: sincos_volatile_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill @@ -147,7 +147,7 @@ entry: ; Negative test. We can't fold atomic stores into the library call. define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) { -; CHECK-LABEL: negative_fold_sincos_atomic_store: +; CHECK-LABEL: sincos_atomic_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill @@ -177,7 +177,7 @@ entry: ; Negative test. We can't fold misaligned stores into the library call. 
define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) { -; CHECK-LABEL: negative_sincos_bad_alignment: +; CHECK-LABEL: sincos_misaligned_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #48 ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill From 235b629ca262e65ff880275e5f2b8fce8bb038ce Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 16 Sep 2024 10:45:53 +0000 Subject: [PATCH 5/7] Fixups --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 583859deb7ee7..e4a2cf5b38299 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2349,9 +2349,9 @@ static bool useSinCos(SDNode *Node) { /// Issue libcalls to sincos to compute sin / cos pairs. void SelectionDAGLegalize::ExpandSinCosLibCall( SDNode *Node, SmallVectorImpl &Results) { - EVT RetVT = Node->getValueType(0); - Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT); + EVT VT = Node->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + RTLIB::Libcall LC = RTLIB::getFSINCOS(VT); // Find users of the node that store the results. The destination pointers // can be used instead of creating stack allocations. @@ -2363,14 +2363,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( if (!ISD::isNormalStore(User)) continue; auto *ST = cast(User); - if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 || - ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy)) + if (!ST->isSimple() || ST->getAddressSpace() != 0 || + ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty)) continue; ResultStores[Use.getResNo()] = ST; } // Collect input chains (and avoid chains referring to one of the stores). 
- SmallVector InChains; + SmallVector InChains; for (auto [ResNum, ST] : llvm::enumerate(ResultStores)) { unsigned OtherResNum = ResNum == 0 ? 1 : 0; if (ST && ST->getChain().getNode() != ResultStores[OtherResNum]) @@ -2384,15 +2384,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( // Pass the argument. Entry.Node = Node->getOperand(0); - Entry.Ty = RetTy; + Entry.Ty = Ty; Args.push_back(Entry); // Pass the output pointers for sin and cos. SmallVector ResultPtrs{}; for (StoreSDNode *ST : ResultStores) { - SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(RetVT); + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT); Entry.Node = ResultPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); + Entry.Ty = PointerType::getUnqual(Ty->getContext()); Args.push_back(Entry); ResultPtrs.push_back(ResultPtr); } @@ -2421,7 +2421,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( DAG.getMachineFunction(), cast(ResultPtr)->getIndex()); } - SDValue LoadResult = DAG.getLoad(RetVT, DL, OutChain, ResultPtr, PtrInfo); + SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo); Results.push_back(LoadResult); } } From bf8f0d9a0ea46ff6115b341ee7b00e553ad7b675 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 17 Sep 2024 14:55:48 +0000 Subject: [PATCH 6/7] Use for-range loop --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index e4a2cf5b38299..0a2454e810433 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2356,17 +2356,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( // Find users of the node that store the results. The destination pointers // can be used instead of creating stack allocations. 
std::array ResultStores = {nullptr}; - for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end(); - UI != UE; ++UI) { - SDUse &Use = UI.getUse(); - SDNode *User = Use.getUser(); + for (SDNode *User : Node->uses()) { if (!ISD::isNormalStore(User)) continue; auto *ST = cast(User); if (!ST->isSimple() || ST->getAddressSpace() != 0 || ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty)) continue; - ResultStores[Use.getResNo()] = ST; + ResultStores[ST->getValue().getResNo()] = ST; } // Collect input chains (and avoid chains referring to one of the stores). From e15c4e3ab41d6300495a38b0989e2dc03e26cf19 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 23 Sep 2024 13:52:51 +0000 Subject: [PATCH 7/7] Fixups --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 24 +++----- .../CodeGen/AArch64/sincos-stack-slots.ll | 57 +++++++++++++++++-- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 0a2454e810433..3c087727a8012 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2353,29 +2353,23 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( Type *Ty = VT.getTypeForEVT(*DAG.getContext()); RTLIB::Libcall LC = RTLIB::getFSINCOS(VT); - // Find users of the node that store the results. The destination pointers - // can be used instead of creating stack allocations. + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. 
+ SDValue StoresInChain{}; std::array ResultStores = {nullptr}; for (SDNode *User : Node->uses()) { if (!ISD::isNormalStore(User)) continue; auto *ST = cast(User); if (!ST->isSimple() || ST->getAddressSpace() != 0 || - ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty)) + ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) || + (StoresInChain && ST->getChain() != StoresInChain) || + Node->isPredecessorOf(ST->getChain().getNode())) continue; ResultStores[ST->getValue().getResNo()] = ST; + StoresInChain = ST->getChain(); } - // Collect input chains (and avoid chains referring to one of the stores). - SmallVector InChains; - for (auto [ResNum, ST] : llvm::enumerate(ResultStores)) { - unsigned OtherResNum = ResNum == 0 ? 1 : 0; - if (ST && ST->getChain().getNode() != ResultStores[OtherResNum]) - InChains.push_back(ST->getChain()); - } - if (InChains.empty()) - InChains.push_back(DAG.getEntryNode()); - TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry{}; @@ -2395,9 +2389,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall( } SDLoc DL(Node); - - // Combine any input chains from the stores. - SDValue InChain = DAG.getTokenFactor(DL, InChains); + SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode(); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll index 697e9c3444f86..8ef8b5d13b62d 100644 --- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -23,7 +23,7 @@ entry: ret { float, float } %ret_1 } -define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) { +define void @sincos_f32_ptr_return(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: sincos_f32_ptr_return: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill @@ -81,7 +81,7 @@ entry: ret { double, double } %ret_1 } -define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) { +define void @sincos_f64_ptr_return(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: sincos_f64_ptr_return: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill @@ -116,8 +116,55 @@ entry: ret double %cos } +; Here %out_sin and %out_cos may alias so we can't replace both stores with the +; call to sincosf (as the order of stores in sincosf is not defined). +define void @sincos_may_alias(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: sincos_may_alias: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: add x1, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: str s0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +; Here %out is used for both sin and cos (with the final value stored being cos). +define float @sincos_multiple_uses(float %x, ptr %out) { +; CHECK-LABEL: sincos_multiple_uses: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %sin = call float @llvm.sin.f32(float %x) + store float %sin, ptr %out, align 4 + %reload = load float, ptr %out, align 4 + %cos = call float @llvm.cos.f32(float %x) + store float %cos, ptr %out, align 4 + ret float %reload +} + ; Negative test. We can't fold volatile stores into the library call. -define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) { +define void @sincos_volatile_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: sincos_volatile_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill @@ -146,7 +193,7 @@ entry: } ; Negative test. We can't fold atomic stores into the library call. -define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) { +define void @sincos_atomic_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: sincos_atomic_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill @@ -176,7 +223,7 @@ entry: } ; Negative test. We can't fold misaligned stores into the library call. -define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) { +define void @sincos_misaligned_result_stores(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: sincos_misaligned_result_stores: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #48