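Fixups: only fold simple, ABI-aligned, address-space-0 stores into the sincos libcall, and chain the call on the stores' input chains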

MacDue · MacDue · commit 37f4aee204d5 · 2024-09-16T10:30:29.000Z
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2351,9 +2351,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   EVT RetVT = Node->getValueType(0);
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry{};
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
 
   // Find users of the node that store the results. The destination pointers
   // can be used instead of creating stack allocations.
@@ -2366,17 +2364,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     if (!ISD::isNormalStore(User))
       continue;
     auto *ST = cast<StoreSDNode>(User);
+    if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 ||
+        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy))
+      continue;
     if (Use.getResNo() == 0)
       SinST = ST;
     if (Use.getResNo() == 1)
       CosST = ST;
   }
 
-  // Pass the argument.
-  Entry.Node = Node->getOperand(0);
-  Entry.Ty = RetTy;
-  Args.push_back(Entry);
-
   auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
     if (MaybeStore)
       return std::make_pair(MaybeStore->getBasePtr(),
@@ -2388,6 +2384,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     return std::make_pair(StackSlot, PtrInfo);
   };
 
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry{};
+
+  // Pass the argument.
+  Entry.Node = Node->getOperand(0);
+  Entry.Ty = RetTy;
+  Args.push_back(Entry);
+
   // Pass the return address of sin.
   auto SinPtr = GetOrCreateOutPointer(SinST);
   Entry.Node = SinPtr.first;
@@ -2400,18 +2404,35 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
   Entry.Ty = PointerType::getUnqual(RetTy->getContext());
   Args.push_back(Entry);
 
-  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
-  auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
-
-  // Replace explict stores with the library call.
+  // Combine any input chains from the stores.
+  SmallVector<SDValue, 2> InChains{};
   for (StoreSDNode *ST : {SinST, CosST}) {
     if (ST)
-      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+      InChains.push_back(ST->getChain());
   }
+  if (InChains.empty())
+    InChains.push_back(DAG.getEntryNode());
 
   SDLoc DL(Node);
+  SDValue InChain = DAG.getTokenFactor(DL, InChains);
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy(DAG.getDataLayout()));
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
+      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+      std::move(Args));
+
+  auto [Call, OutChain] = TLI.LowerCallTo(CLI);
+
+  // Replace the stores with the library call.
+  for (StoreSDNode *ST : {SinST, CosST}) {
+    if (!ST)
+      continue;
+    DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+  }
+
   for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
-    SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+    SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo);
     Results.push_back(LoadExp);
   }
 }
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
+; This file tests eliding stack slots when lowering the FSINCOS ISD node.
+
 define { float, float } @sincos_f32_value_return(float %x) {
 ; CHECK-LABEL: sincos_f32_value_return:
 ; CHECK:       // %bb.0: // %entry
@@ -91,8 +93,8 @@ define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)
   %cos = tail call double @llvm.cos.f64(double %x)
-  store double %sin, ptr %out_sin, align 4
-  store double %cos, ptr %out_cos, align 4
+  store double %sin, ptr %out_sin, align 8
+  store double %cos, ptr %out_cos, align 8
   ret void
 }
 
@@ -110,6 +112,97 @@ define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)
   %cos = tail call double @llvm.cos.f64(double %x)
-  store double %sin, ptr %out_sin, align 4
+  store double %sin, ptr %out_sin, align 8
   ret double %cos
 }
+
+; Negative test. We can't fold volatile stores into the library call.
+define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_volatile_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    str s0, [x20]
+; CHECK-NEXT:    str s1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store volatile float %sin, ptr %out_sin, align 4
+  store volatile float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+; Negative test. We can't fold atomic stores into the library call.
+define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_atomic_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr w8, [sp, #12]
+; CHECK-NEXT:    str w8, [x20]
+; CHECK-NEXT:    ldr w8, [sp, #8]
+; CHECK-NEXT:    str w8, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store atomic float %sin, ptr %out_sin unordered, align 4
+  store atomic float %cos, ptr %out_cos unordered, align 4
+  ret void
+}
+
+; Negative test. We can't fold misaligned stores into the library call.
+define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_misaligned_result_stores:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str d0, [x20]
+; CHECK-NEXT:    str d1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 4
+  store double %cos, ptr %out_cos, align 4
+  ret void
+}