From 72ebc590df92d0bf168d45b84df0e01355f28e8e Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 19 Sep 2025 10:51:26 +0000 Subject: [PATCH 01/14] Add `llvm.vector.partial.reduction.fadd` intrinsic With this intrinsic, and supporting SelectionDAG nodes, we can better make use of instructions such as AArch64's `FDOT`. --- llvm/docs/LangRef.rst | 42 ++++++++++++ llvm/include/llvm/CodeGen/ISDOpcodes.h | 3 +- llvm/include/llvm/CodeGen/TargetLowering.h | 4 +- llvm/include/llvm/IR/Intrinsics.td | 4 ++ .../include/llvm/Target/TargetSelectionDAG.td | 2 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++-- .../SelectionDAG/LegalizeVectorOps.cpp | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 2 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 13 ++++ .../SelectionDAG/SelectionDAGDumper.cpp | 2 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 21 +++--- .../Target/AArch64/AArch64ISelLowering.cpp | 6 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 3 + llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 66 +++++++++++++++++++ 15 files changed, 178 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-fdot.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c089b5a0ba79..9376e3fb326f5 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20780,6 +20780,48 @@ performance, and an out-of-loop phase to calculate the final scalar result. By avoiding the introduction of new ordering constraints, these intrinsics enhance the ability to leverage a target's accumulation instructions. +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.add.nxv4f32.nxv4f32.nxv8f32( %a, %b) + +Overview: +""""""""" + +The '``llvm.vector.partial.reduce.fadd.*``' intrinsics reduce the +concatenation of the two vector arguments down to the number of elements of the +result vector type. + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +Semantics: +"""""""""" + +Other than the reduction operator (e.g. add) the way in which the concatenated +arguments is reduced is entirely unspecified. By their nature these intrinsics +are not expected to be useful in isolation but instead implement the first phase +of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. 
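To make the intended use concrete, here is a minimal sketch of the widening dot-product pattern this intrinsic is designed to capture (the function name is illustrative; the scalable test added later in this patch has the same shape)::

    define <vscale x 4 x float> @fdot_sketch(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
      %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
      %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
      %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
      %red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
      ret <vscale x 4 x float> %red
    }

With ``+sve2p1`` this whole sequence can be selected to a single ``fdot z0.s, z1.h, z2.h``.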
+ '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ff3dd0d4c3c51..d5bb8e780d121 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1516,6 +1516,7 @@ enum NodeType { PARTIAL_REDUCE_SMLA, // sext, sext PARTIAL_REDUCE_UMLA, // zext, zext PARTIAL_REDUCE_SUMLA, // sext, zext + PARTIAL_REDUCE_FMLA, // fpext, fpext // The `llvm.experimental.stackmap` intrinsic. // Operands: input chain, glue, , , [live0[, live1...]] @@ -1767,7 +1768,7 @@ LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type); inline bool isExtOpcode(unsigned Opcode) { return Opcode == ISD::ANY_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::SIGN_EXTEND; + Opcode == ISD::SIGN_EXTEND || Opcode == ISD::FP_EXTEND; } inline bool isExtVecInRegOpcode(unsigned Opcode) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 78f63b4406eb0..b5fc64a44cdcb 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1678,7 +1678,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2792,7 +2792,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..124b4347d706e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2797,6 +2797,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 46c4bb85a7420..78d8a8d264fe5 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -13012,7 +13013,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13032,8 +13033,11 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Op1->getOpcode() == ISD::MUL && + ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && + !(Op1->getOpcode() == ISD::FMUL && + ISD::isConstantSplatVector(Op2.getNode(), C) && + C == APFloat(1.0f).bitcastToAPInt().trunc(C.getBitWidth()))) return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); @@ -13086,6 +13090,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13121,22 +13127,26 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { APInt ConstantOne; if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + !(ConstantOne.isOne() || + ConstantOne == + APFloat(1.0f).bitcastToAPInt().trunc(ConstantOne.getBitWidth()))) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode)) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = Op1Opcode != ISD::ZERO_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? 
ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..94751be5b7986 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bb4a8d9967f94..dd5c011bfe784 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1474,6 +1474,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3689,6 +3690,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 379242ec5a157..1a45cbac6f622 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..413f2e59c42fa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8137,6 +8137,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { + visitTargetIntrinsic(I, Intrinsic); + return; + } + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, + DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, + DAG.getConstantFP(1.0f, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 77377d348b836..d9c654a4d23c4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -588,6 +588,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9bdf82210fed1..04a37d5854163 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12074,12 +12074,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND + : N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA ? ISD::ZERO_EXTEND + : ISD::SIGN_EXTEND; + unsigned ExtOpcRHS = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND + : N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); @@ -12088,7 +12090,7 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, SDValue Input = MulLHS; APInt ConstantOne; if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + !(ConstantOne.isOne() || ConstantOne == APFloat(1.0f).bitcastToAPInt())) Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); unsigned Stride = AccVT.getVectorMinNumElements(); @@ -12099,10 +12101,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? 
ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..9597428375d55 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1923,6 +1923,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + static const unsigned FMLAOps[] = {ISD::PARTIAL_REDUCE_FMLA}; + setPartialReduceMLAAction(FMLAOps, MVT::nxv4f32, MVT::nxv8f16, Legal); + } + // Handle non-aliasing elements mask if (Subtarget->hasSVE2() || (Subtarget->hasSME() && Subtarget->isStreaming())) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..b620b59d24dfd 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4254,6 +4254,9 @@ defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; +def : Pat<(nxv4f32 (partial_reduce_fmla nxv4f32:$Acc, nxv8f16:$LHS, nxv8f16:$RHS)), + (FDOT_ZZZ_S $Acc, $LHS, $RHS)>; + defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; defm BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt>; defm BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb_lane>; diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll new file mode 100644 index 0000000000000..5bb1fae43392f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @fdot_wide_vl128( %acc, %a, %b) { +; CHECK-LABEL: fdot_wide_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = fpext %a to + %b.wide = fpext %b to + %mult = fmul %a.wide, %b.wide + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + ret %partial.reduce +} + +define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { +; CHECK-LABEL: fdot_wide_vl256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x2] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: fcvt z2.s, p0/m, z2.h +; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: fmul z2.s, z2.s, z3.s +; CHECK-NEXT: fadd z0.s, z1.s, z0.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a 
= load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> + %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} From 4f4a319b17ffb1fc5acec75d51fc97bcf8ac1891 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 19 Sep 2025 16:00:49 +0000 Subject: [PATCH 02/14] Revert adding `FP_EXTEND` to `isExtOpcode` --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index d5bb8e780d121..1a3fd27e64c4f 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1768,7 +1768,7 @@ LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type); inline bool isExtOpcode(unsigned Opcode) { return Opcode == ISD::ANY_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::SIGN_EXTEND || Opcode == ISD::FP_EXTEND; + Opcode == ISD::SIGN_EXTEND; } inline bool isExtVecInRegOpcode(unsigned Opcode) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 78d8a8d264fe5..14cae1644f249 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13041,7 +13041,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!ISD::isExtOpcode(LHSOpcode) && LHSOpcode != ISD::FP_EXTEND) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13073,7 +13073,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!ISD::isExtOpcode(RHSOpcode) && RHSOpcode != ISD::FP_EXTEND) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); From 7b339274f93d05055d75983d1cf2eb617d0b3ab2 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 22 Sep 2025 10:15:46 +0000 Subject: [PATCH 03/14] Address review comments Corrected LangRef typos, improved const comparisons for fadd, and add direct tests. 
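For the direct form, where the second operand is not an extended multiply, the generic lowering is conceptually just a chain of subvector adds. At the IR level the effect is roughly the following (illustrative only; the real expansion is performed on SelectionDAG nodes)::

    %lo = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %x, i64 0)
    %hi = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %x, i64 4)
    %t = fadd <4 x float> %acc, %lo
    %res = fadd <4 x float> %t, %hi

which matches the back-to-back ``fadd`` pairs checked by the new direct tests.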
--- llvm/docs/LangRef.rst | 6 +- llvm/include/llvm/IR/Intrinsics.td | 4 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 8 ++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 3 - llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 + llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 66 +++++++++++++++++++ 7 files changed, 90 insertions(+), 18 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 9376e3fb326f5..27038602a0c88 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20789,8 +20789,8 @@ This is an overloaded intrinsic. :: - declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) - declare @llvm.vector.partial.reduce.add.nxv4f32.nxv4f32.nxv8f32( %a, %b) + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) Overview: """"""""" @@ -20810,7 +20810,7 @@ of the result's type, while maintaining the same element type. Semantics: """""""""" -Other than the reduction operator (e.g. add) the way in which the concatenated +Other than the reduction operator (e.g. fadd) the way in which the concatenated arguments is reduced is entirely unspecified. By their nature these intrinsics are not expected to be useful in isolation but instead implement the first phase of an overall reduction operation. diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 124b4347d706e..188b059cf90a0 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2798,8 +2798,8 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], IntrSpeculatable]>; def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], - [llvm_anyfloat_ty, llvm_anyfloat_ty], - [IntrNoMem]>; + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 14cae1644f249..c0a4a63480471 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13033,11 +13033,12 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } APInt C; + ConstantFPSDNode *CFP; if (!(Op1->getOpcode() == ISD::MUL && ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && !(Op1->getOpcode() == ISD::FMUL && - ISD::isConstantSplatVector(Op2.getNode(), C) && - C == APFloat(1.0f).bitcastToAPInt().trunc(C.getBitWidth()))) + (CFP = llvm::isConstOrConstSplatFP(Op2, false)) && + CFP->isExactlyValue(1.0))) return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); @@ -13126,20 +13127,23 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op2 = N->getOperand(2); APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !(ConstantOne.isOne() || - ConstantOne == - APFloat(1.0f).bitcastToAPInt().trunc(ConstantOne.getBitWidth()))) + ConstantFPSDNode *C; + if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && + (C = llvm::isConstOrConstSplatFP(Op2, false)) && + C->isExactlyValue(1.0)) && + !(ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) && + ConstantOne.isOne())) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode)) return SDValue(); - bool Op1IsSigned = Op1Opcode != ISD::ZERO_EXTEND; + bool Op1IsSigned = Op1Opcode == 
ISD::SIGN_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); - if (Op1IsSigned != NodeIsSigned && + if (N->getOpcode() != ISD::PARTIAL_REDUCE_FMLA && + Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 04a37d5854163..523d4beff338e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12089,8 +12089,12 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, } SDValue Input = MulLHS; APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !(ConstantOne.isOne() || ConstantOne == APFloat(1.0f).bitcastToAPInt())) + ConstantFPSDNode *C; + if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && + (C = llvm::isConstOrConstSplatFP(MulRHS, false)) && + C->isExactlyValue(1.0)) && + !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && + ConstantOne.isOne())) Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); unsigned Stride = AccVT.getVectorMinNumElements(); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index b620b59d24dfd..3b268dcbca600 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4254,9 +4254,6 @@ defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; -def : Pat<(nxv4f32 (partial_reduce_fmla nxv4f32:$Acc, nxv8f16:$LHS, nxv8f16:$RHS)), - (FDOT_ZZZ_S $Acc, $LHS, $RHS)>; - defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; defm BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt>; defm BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb_lane>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 1664f4ad0c8fa..ad446ceb24386 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9474,6 +9474,7 @@ multiclass sve_float_dot { def NAME : sve_float_dot; def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; } multiclass sve_fp8_dot @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } + +define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> 
@partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} + +define @partial_reduce_half_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_half_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: fadd z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_float_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_float_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_double_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_double_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} From 56b3a3f465f804655ca1045b093c24be200a8f49 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Tue, 23 Sep 2025 09:13:39 +0000 Subject: [PATCH 04/14] Require reassoc --- llvm/lib/IR/Verifier.cpp | 6 ++++++ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 +++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7917712846990..622b8f790180a 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6581,6 +6581,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: { + Check(Call.hasAllowReassoc(), + "vector_partial_reduce_fadd requires reassociation to be allowed."); + // Fall through to perform the same verification checks as for integers. 
+ [[fallthrough]]; + } case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 69c0b68f23f78..aa2184ab6e65e 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -10,7 +10,7 @@ entry: %a.wide = fpext %a to %b.wide = fpext %b to %mult = fmul %a.wide, %b.wide - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %mult) ret %partial.reduce } @@ -40,7 +40,7 @@ entry: %a.wide = fpext <16 x half> %a to <16 x float> %b.wide = fpext <16 x half> %b to <16 x float> %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + %partial.reduce = call reassoc <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) store <8 x float> %partial.reduce, ptr %accptr ret void } @@ -61,7 +61,7 @@ entry: %a.wide = fpext <8 x half> %a to <8 x float> %b.wide = fpext <8 x half> %b to <8 x float> %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } @@ -72,7 +72,7 @@ define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { ; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: - %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + %partial.reduce = call reassoc <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) ret <8 x half> %partial.reduce } @@ -83,7 +83,7 @@ define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) ret <4 x float> %partial.reduce } @@ -94,7 +94,7 @@ define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { ; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret entry: - %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + %partial.reduce = call reassoc <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) ret <2 x double> %partial.reduce } @@ -105,7 +105,7 @@ define @partial_reduce_half_vl128( %acc, ; CHECK-NEXT: fadd z0.h, z0.h, z2.h ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -116,7 +116,7 @@ define @partial_reduce_float_vl128( %ac ; CHECK-NEXT: fadd z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -127,6 +127,6 @@ define @partial_reduce_double_vl128( ; CHECK-NEXT: fadd z0.d, z0.d, z2.d ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + 
%partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } From 45a27d3c014be8a96e575f5cbd9156ab7d486fb0 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 26 Sep 2025 09:55:22 +0000 Subject: [PATCH 05/14] Revert "Require reassoc" This reverts commit 319852132602f685aea6228f10418370fd530aa7. --- llvm/lib/IR/Verifier.cpp | 6 ------ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 +++++++++--------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 622b8f790180a..7917712846990 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6581,12 +6581,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } - case Intrinsic::vector_partial_reduce_fadd: { - Check(Call.hasAllowReassoc(), - "vector_partial_reduce_fadd requires reassociation to be allowed."); - // Fall through to perform the same verification checks as for integers. - [[fallthrough]]; - } case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index aa2184ab6e65e..69c0b68f23f78 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -10,7 +10,7 @@ entry: %a.wide = fpext %a to %b.wide = fpext %b to %mult = fmul %a.wide, %b.wide - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %mult) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) ret %partial.reduce } @@ -40,7 +40,7 @@ entry: %a.wide = fpext <16 x half> %a to <16 x float> %b.wide = fpext <16 x half> %b to <16 x float> %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call reassoc <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) store <8 x float> %partial.reduce, ptr %accptr ret void } @@ -61,7 +61,7 @@ entry: %a.wide = fpext <8 x half> %a to <8 x float> %b.wide = fpext <8 x half> %b to <8 x float> %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } @@ -72,7 +72,7 @@ define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { ; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) ret <8 x half> %partial.reduce } @@ -83,7 +83,7 @@ define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) ret <4 x float> %partial.reduce } @@ -94,7 +94,7 @@ define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { ; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <2 x 
double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) ret <2 x double> %partial.reduce } @@ -105,7 +105,7 @@ define @partial_reduce_half_vl128( %acc, ; CHECK-NEXT: fadd z0.h, z0.h, z2.h ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -116,7 +116,7 @@ define @partial_reduce_float_vl128( %ac ; CHECK-NEXT: fadd z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -127,6 +127,6 @@ define @partial_reduce_double_vl128( ; CHECK-NEXT: fadd z0.d, z0.d, z2.d ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } From dde874da4d84ee5f7c1abfa44011635c031d4563 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 8 Oct 2025 10:38:25 +0000 Subject: [PATCH 06/14] Re-add Verifier case, and consolidate partial reduction docs --- llvm/docs/LangRef.rst | 64 +++++++++++++++------------------------- llvm/lib/IR/Verifier.cpp | 1 + 2 files changed, 24 insertions(+), 41 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 27038602a0c88..d5100b8f6c56f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20736,8 +20736,27 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial horizontal reductions of vectors can be expressed using the following intrinsics. +Each one reduces the concatenation of the two vector arguments down to the number of elements +of the result vector type. + +Other than the reduction operator (e.g. add, fadd) the way in which the concatenated +arguments is reduced is entirely unspecified. By their nature these intrinsics +are not expected to be useful in isolation but instead implement the first phase +of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + '``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" @@ -20750,13 +20769,6 @@ This is an overloaded intrinsic. declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - Arguments: """""""""" @@ -20765,21 +20777,6 @@ The first argument is an integer vector with the same type as the result. 
The second argument is a vector with a length that is a known integer multiple of the result's type, while maintaining the same element type. -Semantics: -"""""""""" - -Other than the reduction operator (e.g., add) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - '``llvm.vector.partial.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20792,13 +20789,6 @@ This is an overloaded intrinsic. declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.fadd.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - Arguments: """""""""" @@ -20810,17 +20800,9 @@ of the result's type, while maintaining the same element type. Semantics: """""""""" -Other than the reduction operator (e.g. fadd) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. +As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, +this intrinsic will reassociate floating-point values, which may result in variations to the +results due to reordering or by lowering to different instructions. 
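Concretely (an illustrative sketch, not a normative lowering), a call such as::

    %r = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %x)

may be evaluated per lane as ``(%acc[i] + %x[i]) + %x[i+4]``, with the additions associated differently, with a different grouping of lanes altogether, or folded into a target dot-product instruction, and the roundings of these forms may differ.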
'``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7917712846990..c0f5e011f61eb 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6581,6 +6581,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); From 8272c9d49cb39dab765dfae478ced62127ca3dc8 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 10 Oct 2025 13:20:42 +0000 Subject: [PATCH 07/14] Address review comments --- llvm/docs/LangRef.rst | 138 +++++++++--------- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 ++- llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 12 ++ 3 files changed, 102 insertions(+), 72 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index d5100b8f6c56f..dbb2e71942abb 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20363,6 +20363,76 @@ Arguments: """""""""" The argument to this intrinsic must be a vector of floating-point values. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial reductions of vectors can be expressed using the following intrinsics. +Each one reduces the concatenation of the two vector arguments down to the +number of elements of the result vector type. + +Other than the reduction operator (e.g. add, fadd) the way in which the +concatenated arguments is reduced is entirely unspecified. By their nature these +intrinsics are not expected to be useful in isolation but instead implement the +first phase of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + +'``llvm.vector.partial.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) + +Arguments: +"""""""""" + +The first argument is an integer vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. 
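For example, both of the following shapes are valid, with the second operand two and four times as long as the accumulator respectively (the wider variant is shown purely for illustration)::

    declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b)
    declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v16f32(<4 x float> %a, <16 x float> %b)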
+ +Semantics: +"""""""""" + +As the way in which the arguments to this floating-point intrinsic are reduced +is unspecified, this intrinsic will assume floating-point reassociation and +contraction, which may result in variations to the results due to reordering or +by lowering to different instructions (including combining multiple instructions +into a single one). + '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20736,74 +20806,6 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. -Vector Partial Reduction Intrinsics ------------------------------------ - -Partial horizontal reductions of vectors can be expressed using the following intrinsics. -Each one reduces the concatenation of the two vector arguments down to the number of elements -of the result vector type. - -Other than the reduction operator (e.g. add, fadd) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - -'``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) - declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) - declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) - -Arguments: -"""""""""" - -The first argument is an integer vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -'``llvm.vector.partial.reduce.fadd.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) - declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) - -Arguments: -"""""""""" - -The first argument is a floating-point vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -Semantics: -"""""""""" - -As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, -this intrinsic will reassociate floating-point values, which may result in variations to the -results due to reordering or by lowering to different instructions. 
- '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c0a4a63480471..b3d57367d69ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13005,6 +13005,12 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) +// +// partial_reduce_fmla(acc, fmul(fpext(x), splat(C)), splat(1.0)) +// -> partial_reduce_fmla(acc, x, C) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -13041,8 +13047,12 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { CFP->isExactlyValue(1.0))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode) && LHSOpcode != ISD::FP_EXTEND) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13074,7 +13084,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode) && RHSOpcode != ISD::FP_EXTEND) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13120,6 +13130,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); @@ -13136,7 +13148,7 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; @@ -13160,8 +13172,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + auto Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? 
DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 69c0b68f23f78..5e55a1869e95f 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -14,6 +14,18 @@ entry: ret %partial.reduce } +define @fdot_splat_vl128( %acc, %a) { +; CHECK-LABEL: fdot_splat_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = fpext %a to + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) + ret %partial.reduce +} + define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { ; CHECK-LABEL: fdot_wide_vl256: ; CHECK: // %bb.0: // %entry From 2486e1ecb8f8b8de5b25ba7e014a7bb1da23eb49 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 13 Oct 2025 13:24:38 +0000 Subject: [PATCH 08/14] Fixed-length SVE and fix for generating MUL instructions from a partial.reduce.fadd --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 9 ++++++++- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++++++ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 ++++-------------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 523d4beff338e..826115cf0f7a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12095,7 +12095,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, C->isExactlyValue(1.0)) && !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && ConstantOne.isOne())) - Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + switch (N->getOpcode()) { + case ISD::PARTIAL_REDUCE_FMLA: + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + break; + default: + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + break; + }; unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9597428375d55..88faad94c0e32 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2294,6 +2294,13 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1()) { + if (VT.getVectorElementType() == MVT::f32) + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), + Custom); + } + // Lower fixed length vector operations to scalable equivalents. 
setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -7917,6 +7924,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 5e55a1869e95f..cb256851de9c7 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -29,20 +29,10 @@ entry: define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { ; CHECK-LABEL: fdot_wide_vl256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x2] -; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] -; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: fcvt z1.s, p0/m, z1.h -; CHECK-NEXT: fcvt z2.s, p0/m, z2.h -; CHECK-NEXT: fcvt z3.s, p0/m, z3.h -; CHECK-NEXT: fmul z0.s, z0.s, z1.s -; CHECK-NEXT: ldr z1, [x0] -; CHECK-NEXT: fmul z2.s, z2.s, z3.s -; CHECK-NEXT: fadd z0.s, z1.s, z0.s -; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z2, [x2] +; CHECK-NEXT: fdot z0.s, z1.h, z2.h ; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: From 8febb11ff2ba4e9ee340f3cb3890f374e47f0335 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 13 Oct 2025 15:43:57 +0000 Subject: [PATCH 09/14] Address nits --- llvm/docs/LangRef.rst | 19 +++++++++--------- .../CodeGen/SelectionDAG/TargetLowering.cpp | 20 +++++++------------ .../Target/AArch64/AArch64ISelLowering.cpp | 8 +++----- 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index dbb2e71942abb..e5b59422c03ce 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20366,18 +20366,19 @@ The argument to this intrinsic must be a vector of floating-point values. Vector Partial Reduction Intrinsics ----------------------------------- -Partial reductions of vectors can be expressed using the following intrinsics. -Each one reduces the concatenation of the two vector arguments down to the -number of elements of the result vector type. +Partial reductions of vectors can be expressed using the intrinsics described in +this section. Each one reduces the concatenation of the two vector arguments +down to the number of elements of the result vector type. -Other than the reduction operator (e.g. add, fadd) the way in which the +Other than the reduction operator (e.g. add, fadd), the way in which the concatenated arguments is reduced is entirely unspecified. By their nature these -intrinsics are not expected to be useful in isolation but instead implement the -first phase of an overall reduction operation. +intrinsics are not expected to be useful in isolation but can instead be used to +implement the first phase of an overall reduction operation. The typical use case is loop vectorization where reductions are split into an in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. +performance, and an out-of-loop phase is required to calculate the final scalar +result. 
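For instance, the in-loop phase keeps a small vector accumulator and the scalar result is only formed once, outside the loop (a schematic sketch; names are illustrative)::

    ; inside the vectorised loop body
    %acc.next = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %vals)

    ; after the loop
    %sum = call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %acc.final)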
By avoiding the introduction of new ordering constraints, these intrinsics enhance the ability to leverage a target's accumulation instructions. @@ -20429,9 +20430,7 @@ Semantics: As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, this intrinsic will assume floating-point reassociation and -contraction, which may result in variations to the results due to reordering or -by lowering to different instructions (including combining multiple instructions -into a single one). +contraction, which may result in variations to the results. '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 826115cf0f7a9..76fea5e3d837c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12089,20 +12089,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, } SDValue Input = MulLHS; APInt ConstantOne; - ConstantFPSDNode *C; - if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - (C = llvm::isConstOrConstSplatFP(MulRHS, false)) && - C->isExactlyValue(1.0)) && - !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && - ConstantOne.isOne())) - switch (N->getOpcode()) { - case ISD::PARTIAL_REDUCE_FMLA: + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + ConstantFPSDNode *C = llvm::isConstOrConstSplatFP(MulRHS, false); + if (!(C && C->isExactlyValue(1.0))) Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); - break; - default: - Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); - break; - }; + } else if (!(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && + ConstantOne.isOne())) { + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 88faad94c0e32..18af71257064a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2294,11 +2294,9 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } - if (Subtarget->hasSVE2p1()) { - if (VT.getVectorElementType() == MVT::f32) - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, - MVT::getVectorVT(MVT::f16, NumElts * 2), - Custom); + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); } // Lower fixed length vector operations to scalable equivalents. 
From 92219ab473681b073c33e4c7b49600c81cdcb00e Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 15 Oct 2025 15:40:13 +0000 Subject: [PATCH 10/14] Simplify conditionals and further refine documentation --- llvm/docs/LangRef.rst | 4 +- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 6 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 29 ++++--- llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 84 +++++++++++++++---- 7 files changed, 100 insertions(+), 50 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e5b59422c03ce..915d3ae078c2c 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20430,7 +20430,9 @@ Semantics: As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, this intrinsic will assume floating-point reassociation and -contraction, which may result in variations to the results. +contraction can be leveraged to implement the reduction, which may result in +variations to the results due to reordering or by lowering to different +instructions (including combining multiple instructions into a single one). '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 1759463ea7965..cd466dceb900f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1949,6 +1949,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false); /// be zero. LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false); +/// Return true if the value is a constant floating-point value, or a splatted +/// vector of a constant floating-point value, of 1.0 (with no undefs). +LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b3d57367d69ff..c9eabd80450e3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13038,13 +13038,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - ConstantFPSDNode *CFP; - if (!(Op1->getOpcode() == ISD::MUL && - ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && - !(Op1->getOpcode() == ISD::FMUL && - (CFP = llvm::isConstOrConstSplatFP(Op2, false)) && - CFP->isExactlyValue(1.0))) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { @@ -13060,6 +13055,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13138,13 +13134,9 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - ConstantFPSDNode *C; if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - (C = llvm::isConstOrConstSplatFP(Op2, false)) && - C->isExactlyValue(1.0)) && - !(ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) && - ConstantOne.isOne())) + llvm::isOneOrOneSplatFP(Op2)) && + !llvm::isOneOrOneSplat(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 1a45cbac6f622..468180e50e451 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13058,6 +13058,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 413f2e59c42fa..741274e97d18e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8138,16 +8138,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::vector_partial_reduce_fadd: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { - visitTargetIntrinsic(I, Intrinsic); - return; - } SDValue Acc = getValue(I.getOperand(0)); SDValue Input = getValue(I.getOperand(1)); setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, Input, - DAG.getConstantFP(1.0f, sdl, Input.getValueType()))); + DAG.getConstantFP(1.0, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 76fea5e3d837c..b51d6649af2ec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12074,27 +12074,30 @@ SDValue 
TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = - N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND - : N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = - N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND - : N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { - ConstantFPSDNode *C = llvm::isConstOrConstSplatFP(MulRHS, false); - if (!(C && C->isExactlyValue(1.0))) + if (!llvm::isOneOrOneSplatFP(MulRHS)) Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); - } else if (!(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && - ConstantOne.isOne())) { + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index cb256851de9c7..258a372dfdbb2 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -1,11 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 define @fdot_wide_vl128( %acc, %a, %b) { -; CHECK-LABEL: fdot_wide_vl128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_wide_vl128: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z3.s, z1.h +; SVE2-NEXT: uunpklo z4.s, z2.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: uunpkhi z2.s, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fcvt z4.s, p0/m, z4.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fmul z3.s, z3.s, z4.s +; SVE2-NEXT: fmul z1.s, z1.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z3.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_vl128: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret entry: %a.wide = fpext %a to %b.wide = fpext %b to @@ -15,11 +33,22 @@ entry: } define @fdot_splat_vl128( %acc, %a) { -; CHECK-LABEL: fdot_splat_vl128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov z2.h, #1.00000000 -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_splat_vl128: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z2.s, z1.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; 
SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_splat_vl128: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fmov z2.h, #1.00000000 +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret entry: %a.wide = fpext %a to %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) @@ -27,14 +56,33 @@ entry: } define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { -; CHECK-LABEL: fdot_wide_vl256: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] -; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: str z0, [x0] -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_wide_vl256: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, z0.s, z1.s +; SVE2-NEXT: ldr z1, [x0] +; SVE2-NEXT: fmul z2.s, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, z1.s, z0.s +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: str z0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_vl256: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ldr z0, [x0] +; SVE2P1-NEXT: ldr z1, [x1] +; SVE2P1-NEXT: ldr z2, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: str z0, [x0] +; SVE2P1-NEXT: ret entry: %acc = load <8 x float>, ptr %accptr %a = load <16 x half>, ptr %aptr From 5c09a462267b79937c50fcae3e55139ae6ad1734 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 15 Oct 2025 15:58:57 +0000 Subject: [PATCH 11/14] auto -> SDValue --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c9eabd80450e3..f0118be32c2ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13164,9 +13164,9 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); - auto Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA - ? DAG.getConstantFP(1, DL, UnextOp1VT) - : DAG.getConstant(1, DL, UnextOp1VT); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? 
DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, Constant); From e67a59306fe640b88cad5e0ed18f6c75ffeadb3f Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 17 Oct 2025 14:03:29 +0000 Subject: [PATCH 12/14] Use PatFrag rather than Pat, move fixed-length SVE tests, and test GlobalISel --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 +- .../Target/AArch64/AArch64ISelLowering.cpp | 10 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 - llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 119 ++------- .../AArch64/sve2p1-fixed-length-fdot.ll | 230 ++++++++++++++++++ 6 files changed, 258 insertions(+), 121 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f0118be32c2ed..62f402193b6fd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13008,9 +13008,6 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) // -> partial_reduce_fmla(acc, a, b) -// -// partial_reduce_fmla(acc, fmul(fpext(x), splat(C)), splat(1.0)) -// -> partial_reduce_fmla(acc, x, C) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -13134,20 +13131,17 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - llvm::isOneOrOneSplatFP(Op2)) && - !llvm::isOneOrOneSplat(Op2)) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); - if (N->getOpcode() != ISD::PARTIAL_REDUCE_FMLA && - Op1IsSigned != NodeIsSigned && + if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 18af71257064a..91d6786ed3158 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1921,12 +1921,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } - } - // Handle floating-point partial reduction - if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { - static const unsigned FMLAOps[] = {ISD::PARTIAL_REDUCE_FMLA}; - setPartialReduceMLAAction(FMLAOps, MVT::nxv4f32, MVT::nxv8f16, Legal); + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..e1f43867bbe5b 100644 
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index ad446ceb24386..1664f4ad0c8fa 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9474,7 +9474,6 @@ multiclass sve_float_dot { def NAME : sve_float_dot; def : SVE_3_Op_Pat(NAME)>; - def : SVE_3_Op_Pat(NAME)>; } multiclass sve_fp8_dot @fdot_wide_vl128( %acc, %a, %b) { -; SVE2-LABEL: fdot_wide_vl128: +define @fdot_wide_nxv4f32( %acc, %a, %b) { +; SVE2-LABEL: fdot_wide_nxv4f32: ; SVE2: // %bb.0: // %entry ; SVE2-NEXT: uunpklo z3.s, z1.h ; SVE2-NEXT: uunpklo z4.s, z2.h @@ -20,7 +22,7 @@ define @fdot_wide_vl128( %acc, %partial.reduce } -define @fdot_splat_vl128( %acc, %a) { -; SVE2-LABEL: fdot_splat_vl128: +define @fdot_splat_nxv4f32( %acc, %a) { +; SVE2-LABEL: fdot_splat_nxv4f32: ; SVE2: // %bb.0: // %entry ; SVE2-NEXT: uunpklo z2.s, z1.h ; SVE2-NEXT: ptrue p0.s @@ -44,7 +46,7 @@ define @fdot_splat_vl128( %acc, %partial.reduce } -define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { -; SVE2-LABEL: fdot_wide_vl256: -; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s -; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] -; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] -; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] -; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] -; SVE2-NEXT: fcvt z0.s, p0/m, z0.h -; SVE2-NEXT: fcvt z1.s, p0/m, z1.h -; SVE2-NEXT: fcvt z2.s, p0/m, z2.h -; SVE2-NEXT: fcvt z3.s, p0/m, z3.h -; SVE2-NEXT: fmul z0.s, z0.s, z1.s -; SVE2-NEXT: ldr z1, [x0] -; SVE2-NEXT: fmul z2.s, z2.s, z3.s -; SVE2-NEXT: fadd z0.s, z1.s, z0.s -; SVE2-NEXT: fadd z0.s, z0.s, z2.s -; SVE2-NEXT: str z0, [x0] -; SVE2-NEXT: ret -; -; SVE2P1-LABEL: fdot_wide_vl256: -; SVE2P1: // %bb.0: // %entry -; SVE2P1-NEXT: ldr z0, [x0] -; SVE2P1-NEXT: ldr z1, [x1] -; SVE2P1-NEXT: ldr z2, [x2] -; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h -; SVE2P1-NEXT: str z0, [x0] -; SVE2P1-NEXT: ret -entry: - %acc = load <8 x float>, ptr %accptr - %a = load <16 x half>, ptr %aptr - %b = load <16 x half>, ptr %bptr - %a.wide = fpext <16 x half> %a to <16 x float> - %b.wide = fpext <16 x half> %b to <16 x float> - %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) - store <8 x float> %partial.reduce, ptr %accptr - ret void -} - -define <4 x 
float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: fixed_fdot_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NEXT: fcvtl v4.4s, v2.4h -; CHECK-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-NEXT: fcvtl2 v2.4s, v2.8h -; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret -entry: - %a.wide = fpext <8 x half> %a to <8 x float> - %b.wide = fpext <8 x half> %b to <8 x float> - %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) - ret <4 x float> %partial.reduce -} - -define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { -; CHECK-LABEL: partial_reduce_half: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h -; CHECK-NEXT: ret -entry: - %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) - ret <8 x half> %partial.reduce -} - -define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { -; CHECK-LABEL: partial_reduce_float: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret -entry: - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) - ret <4 x float> %partial.reduce -} - -define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { -; CHECK-LABEL: partial_reduce_double: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d -; CHECK-NEXT: ret -entry: - %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) - ret <2 x double> %partial.reduce -} - -define @partial_reduce_half_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_half_vl128: +define @partial_reduce_nxv8f16( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv8f16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.h, z0.h, z1.h ; CHECK-NEXT: fadd z0.h, z0.h, z2.h @@ -159,8 +68,8 @@ entry: ret %partial.reduce } -define @partial_reduce_float_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_float_vl128: +define @partial_reduce_nxv4f32( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv4f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.s, z0.s, z1.s ; CHECK-NEXT: fadd z0.s, z0.s, z2.s @@ -170,8 +79,8 @@ entry: ret %partial.reduce } -define @partial_reduce_double_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_double_vl128: +define @partial_reduce_nxv2f64( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.d, z0.d, z1.d ; CHECK-NEXT: fadd z0.d, z0.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll new file mode 100644 index 0000000000000..b07b571413881 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s 
--check-prefixes=CHECK,SVE2P1 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) { +; SVE2-LABEL: fdot_wide_v8f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl8 +; SVE2-NEXT: mov x8, #8 // =0x8 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v8f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl8 +; SVE2P1-NEXT: ptrue p1.h, vl16 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a = load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) { +; SVE2-LABEL: fdot_wide_v16f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl16 +; SVE2-NEXT: mov x8, #16 // =0x10 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v16f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl16 +; SVE2P1-NEXT: ptrue p1.h, vl32 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <16 x float>, ptr %accptr + %a = load <32 x half>, ptr %aptr + %b = load <32 x half>, ptr %bptr + %a.wide = fpext <32 x half> %a to <32 x float> + %b.wide = fpext <32 x half> %b to <32 x float> + %mult = fmul <32 x float> %a.wide, %b.wide + %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult) + store <16 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) { +; SVE2-LABEL: fdot_wide_v32f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl32 +; 
SVE2-NEXT: mov x8, #32 // =0x20 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v32f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl32 +; SVE2P1-NEXT: ptrue p1.h, vl64 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <32 x float>, ptr %accptr + %a = load <64 x half>, ptr %aptr + %b = load <64 x half>, ptr %bptr + %a.wide = fpext <64 x half> %a to <64 x float> + %b.wide = fpext <64 x half> %b to <64 x float> + %mult = fmul <64 x float> %a.wide, %b.wide + %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult) + store <32 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) { +; SVE2-LABEL: fdot_wide_v64f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl64 +; SVE2-NEXT: mov x8, #64 // =0x40 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v64f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl64 +; SVE2P1-NEXT: ptrue p1.h, vl128 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <64 x float>, ptr %accptr + %a = load <128 x half>, ptr %aptr + %b = load <128 x half>, ptr %bptr + %a.wide = fpext <128 x half> %a to <128 x float> + %b.wide = fpext <128 x half> %b to <128 x float> + %mult = fmul <128 x float> %a.wide, %b.wide + %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult) + store <64 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> 
+ %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} + +define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} From 5e2d16cb0273bcefef412590320f62158271b7c2 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 17 Oct 2025 14:37:13 +0000 Subject: [PATCH 13/14] Fix formatting --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 62f402193b6fd..e239848c4bee9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13138,7 +13138,8 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 741274e97d18e..8348fc7ca08c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8140,10 +8140,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_partial_reduce_fadd: { SDValue Acc = getValue(I.getOperand(0)); SDValue Input = getValue(I.getOperand(1)); - setValue(&I, - DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, - Input, - DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { From 5b1aa0d418ddb02967bd90143f23987b781effb5 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 3 Nov 2025 15:32:10 +0000 Subject: [PATCH 14/14] Remove duplicate `-mtriple`s and redundant 
fixed-length GlobalISel tests. --- llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 10 ++++++---- llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index c055940768dfa..9dbe096ebdb57 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 -; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +target triple = "aarch64-linux-gnu" define @fdot_wide_nxv4f32( %acc, %a, %b) { ; SVE2-LABEL: fdot_wide_nxv4f32: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll index b07b571413881..89216ce2cb72b 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 -; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +target triple = "aarch64-linux-gnu" define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) { ; SVE2-LABEL: fdot_wide_v8f32:
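
For reference, a fully-typed sketch of the pattern this series targets, following the
fdot_wide_nxv4f32 test in llvm/test/CodeGen/AArch64/sve2p1-fdot.ll: an fpext/fmul pair
feeding llvm.vector.partial.reduce.fadd, which with -mattr=+sve2p1 selects to a single
"fdot z0.s, z1.h, z2.h" (the function and value names here simply mirror that test)::

  define <vscale x 4 x float> @fdot_wide_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
  entry:
    %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
    %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
    %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
    ; The concatenation of %acc and %mult is reduced back to <vscale x 4 x float>.
    ; The reduction order is unspecified, which is what lets the target pick FDOT.
    %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
    ret <vscale x 4 x float> %partial.reduce
  }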