From 0c0a0d795a83fd0121f0bf243abddee3cf3714f9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 31 Oct 2025 20:33:25 -0700 Subject: [PATCH 1/5] BasicTTI: Clean up multiple result intrinsic handling Avoid weird lambda returning function pointer and sink the libcall logic to where the operation is handled. This allows chaining the libcall logic to try sincos_stret and fall back to sincos. The resulting cost seems too low. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 60 ++++++++++++------- .../test/Analysis/CostModel/AArch64/sincos.ll | 21 +++++-- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..bbce59b71edae 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /// (e.g. scalarization). std::optional getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. 
@@ -311,11 +310,43 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { !isVectorizedStructTy(cast(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + EVT ScalarVT = VT.getScalarType(); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + bool UsesMemoryOutArgument = true; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(ScalarVT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(ScalarVT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS_STRET(ScalarVT); + UsesMemoryOutArgument = false; + + if (getTLI()->getLibcallImpl(LC) == RTLIB::Unsupported) { + LC = RTLIB::getSINCOS(ScalarVT); + UsesMemoryOutArgument = true; + } + + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; + StringRef LCName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); + // Search for a corresponding vector variant. LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); @@ -336,6 +367,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { VecTy, {}, CostKind, 0, nullptr, {}); } + // Technically this depends on the ABI, but assume sincos_stret passes in + // registers. + if (!UsesMemoryOutArgument) + return Cost; + // Lowering to a library call (with output pointers) may require us to emit // reloads for the results. 
for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) { @@ -2137,22 +2173,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2180,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. 
break; diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..72c8f2bbbf8cf 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; 
CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { , } @llvm.sincos.nxv8f16( poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { , } @llvm.sincos.nxv4f32( poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { , } @llvm.sincos.nxv2f64( poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { , } @llvm.sincos.nxv8f16( poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { , } @llvm.sincos.nxv4f32( poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost 
of 13 for instruction: %nxv2f64 = call { , } @llvm.sincos.nxv2f64( poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { , } @llvm.sincos.nxv1f128( poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { , } @llvm.sincos.nxv8f32( poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { , } @llvm.sincos.nxv8f16( poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { , } @llvm.sincos.nxv4f32( poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { , } @llvm.sincos.nxv2f64( poison) 
+; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv1f128 = call { , } @llvm.sincos.nxv1f128( poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { , } @llvm.sincos.nxv8f32( poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) From eb7787445d83ad7ede9ac96c859052ffc1bd87a4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 1 Nov 2025 08:49:44 -0700 Subject: [PATCH 2/5] Drop UsesMemoryOutArgument part --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index bbce59b71edae..193e344bafc12 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -316,8 +316,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { EVT ScalarVT = VT.getScalarType(); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; - bool UsesMemoryOutArgument = true; - switch (ICA.getID()) { case Intrinsic::modf: LC = RTLIB::getMODF(ScalarVT); @@ -326,13 +324,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { LC = RTLIB::getSINCOSPI(ScalarVT); break; case Intrinsic::sincos: + // TODO: Account for sincos_stret not always using a memory operation for + // the out argument LC = RTLIB::getSINCOS_STRET(ScalarVT); - UsesMemoryOutArgument = false; - if (getTLI()->getLibcallImpl(LC) == RTLIB::Unsupported) { + if (getTLI()->getLibcallImpl(LC) == RTLIB::Unsupported) LC = RTLIB::getSINCOS(ScalarVT); - UsesMemoryOutArgument = true; - } break; default: @@ -367,11 +364,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { VecTy, {}, CostKind, 0, nullptr, {}); } - // Technically this depends on the ABI, but assume sincos_stret passes in - // registers. 
- if (!UsesMemoryOutArgument) - return Cost; - // Lowering to a library call (with output pointers) may require us to emit // reloads for the results. for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) { From 400e651f0dcb33438d66dd04e9b4fc666413e2f6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 1 Nov 2025 08:52:46 -0700 Subject: [PATCH 3/5] Add fixme --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 193e344bafc12..afbd921f4c1e7 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -345,6 +345,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); // Search for a corresponding vector variant. + // + // FIXME: CodeGen use RuntimeLibcallsInfo, not TargetLibraryInfo and has no + // path to using the vector libcalls. So this guess at how legalization will + // work is just wrong. 
LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr; From 600551a30b0dfa7297a26071afe43affc0336691 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 3 Nov 2025 20:00:01 -0800 Subject: [PATCH 4/5] drop sincos_stret part since this is for vectors --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index afbd921f4c1e7..0cb41550b1168 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -324,13 +324,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { LC = RTLIB::getSINCOSPI(ScalarVT); break; case Intrinsic::sincos: - // TODO: Account for sincos_stret not always using a memory operation for - // the out argument - LC = RTLIB::getSINCOS_STRET(ScalarVT); - - if (getTLI()->getLibcallImpl(LC) == RTLIB::Unsupported) - LC = RTLIB::getSINCOS(ScalarVT); - + LC = RTLIB::getSINCOS(ScalarVT); break; default: return std::nullopt; From f7f08b26649e0c8befd814582d820d785144de01 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 3 Nov 2025 20:16:05 -0800 Subject: [PATCH 5/5] Reword comment --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0cb41550b1168..221d8f1e2f673 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -340,9 +340,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Search for a corresponding vector variant. // - // FIXME: CodeGen use RuntimeLibcallsInfo, not TargetLibraryInfo and has no - // path to using the vector libcalls. So this guess at how legalization will - // work is just wrong. 
+ // FIXME: Should use RuntimeLibcallsInfo, not TargetLibraryInfo to get the + // vector mapping. LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr;