diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7a13164589392..045ec7d365311 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT);
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getFREXP(EVT RetVT);
 
+/// getFSINCOS - Return the FSINCOS_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFSINCOS(EVT RetVT);
+
 /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getSYNC(unsigned Opc, MVT VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5fbc01cd95e9..3c087727a8012 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
 
 /// Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
-  RTLIB::Libcall LC;
-  switch (Node->getSimpleValueType(0).SimpleTy) {
-  default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
-  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
-  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
-  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
-  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
-  }
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy);
   return TLI.getLibcallName(LC) != nullptr;
 }
 
@@ -2355,68 +2347,72 @@ static bool useSinCos(SDNode *Node) {
 }
 
 /// Issue libcalls to sincos to compute sin / cos pairs.
-void
-SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
-                                          SmallVectorImpl<SDValue> &Results) {
-  RTLIB::Libcall LC;
-  switch (Node->getSimpleValueType(0).SimpleTy) {
-  default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
-  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
-  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
-  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
-  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+void SelectionDAGLegalize::ExpandSinCosLibCall(
+    SDNode *Node, SmallVectorImpl<SDValue> &Results) {
+  EVT VT = Node->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(VT);
+
+  // Find users of the node that store the results (and share input chains). The
+  // destination pointers can be used instead of creating stack allocations.
+  SDValue StoresInChain{};
+  std::array<StoreSDNode *, 2> ResultStores = {nullptr};
+  for (SDNode *User : Node->uses()) {
+    if (!ISD::isNormalStore(User))
+      continue;
+    auto *ST = cast<StoreSDNode>(User);
+    if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
+        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) ||
+        (StoresInChain && ST->getChain() != StoresInChain) ||
+        Node->isPredecessorOf(ST->getChain().getNode()))
+      continue;
+    ResultStores[ST->getValue().getResNo()] = ST;
+    StoresInChain = ST->getChain();
   }
 
-  // The input chain to this libcall is the entry node of the function.
-  // Legalizing the call will automatically add the previous call to the
-  // dependence.
-  SDValue InChain = DAG.getEntryNode();
-
-  EVT RetVT = Node->getValueType(0);
-  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-
   TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  TargetLowering::ArgListEntry Entry{};
 
   // Pass the argument.
   Entry.Node = Node->getOperand(0);
-  Entry.Ty = RetTy;
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
-  Args.push_back(Entry);
-
-  // Pass the return address of sin.
-  SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
-  Entry.Node = SinPtr;
-  Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
+  Entry.Ty = Ty;
   Args.push_back(Entry);
 
-  // Also pass the return address of the cos.
-  SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
-  Entry.Node = CosPtr;
-  Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
-  Args.push_back(Entry);
+  // Pass the output pointers for sin and cos.
+  SmallVector<SDValue, 2> ResultPtrs{};
+  for (StoreSDNode *ST : ResultStores) {
+    SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT);
+    Entry.Node = ResultPtr;
+    Entry.Ty = PointerType::getUnqual(Ty->getContext());
+    Args.push_back(Entry);
+    ResultPtrs.push_back(ResultPtr);
+  }
 
+  SDLoc DL(Node);
+  SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode();
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                          TLI.getPointerTy(DAG.getDataLayout()));
-
-  SDLoc dl(Node);
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
+  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
       TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
       std::move(Args));
 
-  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+  auto [Call, OutChain] = TLI.LowerCallTo(CLI);
 
-  Results.push_back(
-      DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
-  Results.push_back(
-      DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+  for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
+    MachinePointerInfo PtrInfo;
+    if (StoreSDNode *ST = ResultStores[ResNo]) {
+      // Replace store with the library call.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+      PtrInfo = ST->getPointerInfo();
+    } else {
+      PtrInfo = MachinePointerInfo::getFixedStack(
+          DAG.getMachineFunction(),
+          cast<FrameIndexSDNode>(ResultPtr)->getIndex());
+    }
+    SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
+    Results.push_back(LoadResult);
+  }
 }
 
 SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9fdde45455917..1f49d60c97059 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -400,6 +400,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
                       FREXP_PPCF128);
 }
 
+RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
+  return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128,
+                      SINCOS_PPCF128);
+}
+
 RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
                                              AtomicOrdering Order,
                                              uint64_t MemSize) {
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
new file mode 100644
index 0000000000000..8ef8b5d13b62d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+; This file tests eliding stack slots when lowering the FSINCOS ISD node.
+
+define { float, float } @sincos_f32_value_return(float %x) {
+; CHECK-LABEL: sincos_f32_value_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  %ret_0 = insertvalue { float, float } poison, float %sin, 0
+  %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1
+  ret { float, float } %ret_1
+}
+
+define void @sincos_f32_ptr_return(float %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: sincos_f32_ptr_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store float %sin, ptr %out_sin, align 4
+  store float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f32_mixed_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x1, sp, #12
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store float %sin, ptr %out_sin, align 4
+  ret float %cos
+}
+
+define { double, double } @sincos_f64_value_return(double %x) {
+; CHECK-LABEL: sincos_f64_value_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  %ret_0 = insertvalue { double, double } poison, double %sin, 0
+  %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1
+  ret { double, double } %ret_1
+}
+
+define void @sincos_f64_ptr_return(double %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: sincos_f64_ptr_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 8
+  store double %cos, ptr %out_cos, align 8
+  ret void
+}
+
+define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f64_mixed_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 8
+  ret double %cos
+}
+
+; Here %out_sin and %out_cos may alias so we can't replace both stores with the
+; call to sincosf (as the order of stores in sincosf is not defined).
+define void @sincos_may_alias(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_may_alias:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    add x1, sp, #12
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    str s0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store float %sin, ptr %out_sin, align 4
+  store float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+; Here %out is used for both sin and cos (with the final value stored being cos).
+define float @sincos_multiple_uses(float %x, ptr %out) {
+; CHECK-LABEL: sincos_multiple_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x1, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %sin = call float @llvm.sin.f32(float %x)
+  store float %sin, ptr %out, align 4
+  %reload = load float, ptr %out, align 4
+  %cos = call float @llvm.cos.f32(float %x)
+  store float %cos, ptr %out, align 4
+  ret float %reload
+}
+
+; Negative test. We can't fold volatile stores into the library call.
+define void @sincos_volatile_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: sincos_volatile_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    str s0, [x20]
+; CHECK-NEXT:    str s1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store volatile float %sin, ptr %out_sin, align 4
+  store volatile float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+; Negative test. We can't fold atomic stores into the library call.
+define void @sincos_atomic_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: sincos_atomic_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr w8, [sp, #12]
+; CHECK-NEXT:    str w8, [x20]
+; CHECK-NEXT:    ldr w8, [sp, #8]
+; CHECK-NEXT:    str w8, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store atomic float %sin, ptr %out_sin unordered, align 4
+  store atomic float %cos, ptr %out_cos unordered, align 4
+  ret void
+}
+
+; Negative test. We can't fold misaligned stores into the library call.
+define void @sincos_misaligned_result_stores(double %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: sincos_misaligned_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str d0, [x20]
+; CHECK-NEXT:    str d1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 4
+  store double %cos, ptr %out_cos, align 4
+  ret void
+}
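
Reviewer note: the multi-clause condition in the new store-scan loop is the heart of this patch, so below it is restated as a standalone predicate. This is a sketch only, not part of the change; the helper name isFoldableSinCosStore is illustrative and does not exist in tree.

  #include "llvm/CodeGen/SelectionDAGNodes.h" // StoreSDNode, SDNode, SDValue
  #include "llvm/Support/Alignment.h"         // Align
  using namespace llvm;

  // Sketch: when may ExpandSinCosLibCall write a result directly through an
  // existing store's destination pointer instead of a fresh stack slot?
  static bool isFoldableSinCosStore(StoreSDNode *ST, SDNode *SinCos,
                                    SDValue SharedChain, Align ABIAlign) {
    // Only plain stores qualify: not volatile, atomic, truncating, or
    // indexed, and only in the default address space.
    if (!ST->isSimple() || ST->getAddressSpace() != 0)
      return false;
    // The libcall writes with natural (ABI) alignment, so an under-aligned
    // store cannot be replaced by the call's own write.
    if (ST->getAlign() < ABIAlign)
      return false;
    // All folded stores must hang off one shared input chain so a single
    // call can replace them without reordering other memory operations.
    if (SharedChain && ST->getChain() != SharedChain)
      return false;
    // If the store's chain depends on the FSINCOS node itself, folding
    // would create a cycle in the DAG.
    return !SinCos->isPredecessorOf(ST->getChain().getNode());
  }

The shared-chain clause is also what makes the sincos_may_alias test fold only one of its two stores: potentially aliasing stores are chained serially in the DAG, so only the first store shares the call's input chain.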