[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583


Conversation

alexey-bataev
Member

Added emission of a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, where that is more profitable.

Created using spr 1.3.5
@llvmbot
Member

llvmbot commented Jul 8, 2025

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Added emission of a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, where that is more profitable.
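
For example (taken from the reduce-fadd.ll test update below), a fast two-lane fadd whose inputs are extracted from a vector is now emitted as a single 2-element reduction intrinsic when the cost model finds the reduction cheaper:

; Before: extract both lanes and combine them with a scalar op.
%elt0 = extractelement <2 x float> %vec2, i64 0
%elt1 = extractelement <2 x float> %vec2, i64 1
%add1 = fadd fast float %elt1, %elt0

; After: a single 2-element reduction.
%add1 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %vec2)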


Patch is 36.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147583.diff

8 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+116-53)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll (+2-6)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll (+16-14)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll (+2-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+6-18)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll (+11-27)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll (+80-48)
  • (modified) llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll (+3-3)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..bec393051a257 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21676,58 +21676,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   return Changed;
 }
 
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
-  if (!I)
-    return false;
-
-  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
-    return false;
-
-  Value *P = I->getParent();
-
-  // Vectorize in current basic block only.
-  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
-  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
-  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
-      R.isDeleted(Op0) || R.isDeleted(Op1))
-    return false;
-
-  // First collect all possible candidates
-  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
-  Candidates.emplace_back(Op0, Op1);
-
-  auto *A = dyn_cast<BinaryOperator>(Op0);
-  auto *B = dyn_cast<BinaryOperator>(Op1);
-  // Try to skip B.
-  if (A && B && B->hasOneUse()) {
-    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
-    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
-      Candidates.emplace_back(A, B0);
-    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
-      Candidates.emplace_back(A, B1);
-  }
-  // Try to skip A.
-  if (B && A && A->hasOneUse()) {
-    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
-    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
-      Candidates.emplace_back(A0, B);
-    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
-      Candidates.emplace_back(A1, B);
-  }
-
-  if (Candidates.size() == 1)
-    return tryToVectorizeList({Op0, Op1}, R);
-
-  // We have multiple options. Try to pick the single best.
-  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
-  if (!BestCandidate)
-    return false;
-  return tryToVectorizeList(
-      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}
-
 namespace {
 
 /// Model horizontal reductions.
@@ -21770,6 +21718,8 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// The minimum number of the reduced values.
+  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
   /// Contains vector values for reduction including their scale factor and
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -22068,6 +22018,24 @@ class HorizontalReduction {
 
 public:
   HorizontalReduction() = default;
+  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+      : ReductionRoot(I), ReductionLimit(2) {
+    RdxKind = HorizontalReduction::getRdxKind(I);
+    ReductionOps.emplace_back().push_back(I);
+    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+    for (Value *V : Ops)
+      ReducedValsToOps[V].push_back(I);
+  }
+
+  bool matchReductionForOperands() const {
+    // Analyze "regular" integer/FP types for reductions - no target-specific
+    // types or pointers.
+    assert(ReductionRoot && "Reduction root is not set!");
+    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot)))
+      return false;
+
+    return true;
+  }
 
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22235,7 +22203,6 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI, AssumptionCache *AC) {
-    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -23740,6 +23707,102 @@ bool SLPVectorizerPass::vectorizeHorReduction(
   return Res;
 }
 
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+  if (!I)
+    return false;
+
+  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
+    return false;
+
+  Value *P = I->getParent();
+
+  // Vectorize in current basic block only.
+  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+      R.isDeleted(Op0) || R.isDeleted(Op1))
+    return false;
+
+  // First collect all possible candidates
+  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+  Candidates.emplace_back(Op0, Op1);
+
+  auto *A = dyn_cast<BinaryOperator>(Op0);
+  auto *B = dyn_cast<BinaryOperator>(Op1);
+  // Try to skip B.
+  if (A && B && B->hasOneUse()) {
+    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
+      Candidates.emplace_back(A, B0);
+    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
+      Candidates.emplace_back(A, B1);
+  }
+  // Try to skip A.
+  if (B && A && A->hasOneUse()) {
+    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
+      Candidates.emplace_back(A0, B);
+    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
+      Candidates.emplace_back(A1, B);
+  }
+
+  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst, ArrayRef<Value *> Ops) {
+    if (!isReductionCandidate(Inst))
+      return false;
+    Type *Ty = Inst->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
+      return false;
+    HorizontalReduction HorRdx(Inst, Ops);
+    if (!HorRdx.matchReductionForOperands())
+      return false;
+    // Check the cost of operations.
+    VectorType *VecTy = getWidenedType(Ty, Ops.size());
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost ScalarCost =
+        TTI.getScalarizationOverhead(
+            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+            /*Extract=*/true, CostKind) +
+        TTI.getInstructionCost(Inst, CostKind);
+    InstructionCost RedCost;
+    switch (::getRdxKind(Inst)) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      FastMathFlags FMF;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+        FMF = FPCI->getFastMathFlags();
+      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+                                               CostKind);
+      break;
+    }
+    default:
+      return false;
+    }
+    if (RedCost >= ScalarCost)
+      return false;
+
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+  };
+  if (Candidates.size() == 1)
+    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
+
+  // We have multiple options. Try to pick the single best.
+  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
+  if (!BestCandidate)
+    return false;
+  return TryToReduce(I, {Candidates[*BestCandidate].first,
+                         Candidates[*BestCandidate].second}) ||
+         tryToVectorizeList({Candidates[*BestCandidate].first,
+                             Candidates[*BestCandidate].second},
+                            R);
+}
+
 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                  BasicBlock *BB, BoUpSLP &R) {
   SmallVector<WeakTrackingVH> PostponedInsts;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
 ; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
 
 define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT:    ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT:  [[ENTRY:.*:]]
+; NOFP16-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT:    ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT:  [[ENTRY:.*:]]
+; FULLFP16-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT:    ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
 
 define half @reduce_half2(<2 x half> %vec2) {
 ; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
 ; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
 ; CHECK-LABEL: define float @reduce_fast_float2(
 ; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
 ; CHECK-NEXT:    ret float [[ADD1]]
 ;
 entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
 ; CHECK-LABEL: define double @reduce_fast_double2(
 ; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
 ; CHECK-NEXT:    ret double [[ADD1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
 ; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT:    [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
 ; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
 ; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT:    br label [[TMP8:%.*]]
-; POWEROF2:       7:
+; POWEROF2:       6:
 ; POWEROF2-NEXT:    br label [[TMP8]]
-; POWEROF2:       8:
-; POWEROF2-NEXT:    [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2:       7:
+; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
 ; POWEROF2-NEXT:    br label [[TMP11:%.*]]
-; POWEROF2:       11:
+; POWEROF2:       9:
 ; POWEROF2-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
 ; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT:    [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT:    [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT:    [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT:    [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT:    [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
 ; POWEROF2-NEXT:    [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
 ; POWEROF2-NEXT:    [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
 ; POWEROF2-NEXT:    ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32(
 ; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]...
[truncated]


github-actions bot commented Jul 8, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
Collaborator

@preames left a comment


I'm supportive of the direction, and the RISCV changes look fine.

return false;
return tryToVectorizeList(
{Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
Collaborator


Please can you move this as an NFC and then rebase? I can't tell if there are any changes to the implementation.

Created using spr 1.3.5
Collaborator

@RKSimon left a comment


LGTM with a couple of minors.

; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
Collaborator


Technically corei7 is not an AVX CPU - it's an SSE4.2 CPU - but its costs are close enough that it's fine to keep it on CHECK-AVX for now.

Member Author


I can rename to SSE/NONSSE instead

Collaborator


Thanks, but no need - I might alter the tested CPUs in a future patch though.

case RecurKind::And:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul: {
Collaborator


Can't the integer min/max kinds be included here?

Member Author


We do not vectorize those here for now, just binary operations and compares.

Created using spr 1.3.5
@alexey-bataev merged commit ac4a38e into main Jul 9, 2025
9 checks passed
@alexey-bataev deleted the users/alexey-bataev/spr/slp-emit-reduction-instead-of-2-extracts-scalar-op-when-vectorizing-operands branch July 9, 2025 23:52
@ronlieb
Contributor

ronlieb commented Jul 10, 2025

This patch seems to be breaking 4 SPEC Accel2023 tests: 404, 456, 457, 470.
0017: minU: 0.000000e+00 maxU: 8.777816e-03
minU: 0.000000e+00 maxU: 8.768163e-03
^
Will try to create a small reproducer.

@ronlieb self-requested a review July 10, 2025 13:07
asb added a commit that referenced this pull request Jul 10, 2025
…vectorizing operands (#147583)"

This reverts commit ac4a38e.

This breaks the RVV builders
(MicroBenchmarks/ImageProcessing/Blur/blur.test and
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite)
and reportedly SPEC Accel2023
<#147583 (comment)>.
@asb
Contributor

asb commented Jul 10, 2025

I've pushed a revert for the time being because this breaks the RVV builders too (MicroBenchmarks/ImageProcessing/Blur/blur.test and MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite)

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Jul 10, 2025
…r op, when vectorizing operands (#147583)"

This reverts commit ac4a38e.

This breaks the RVV builders
(MicroBenchmarks/ImageProcessing/Blur/blur.test and
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite)
and reportedly SPEC Accel2023
<llvm/llvm-project#147583 (comment)>.
@asb
Contributor

asb commented Jul 10, 2025

The changed translation unit from the Blur microbenchmark is quite small, so here it is isolated:

tc.ll:

; ModuleID = 'gaussianBlurKernel.bc'
source_filename = "./MicroBenchmarks/ImageProcessing/Blur/gaussianBlurKernel.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

; Function Attrs: nounwind uwtable vscale_range(2,1024)
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef %inputImage, ptr noundef %outputImage) #0 {
entry:
  %height.addr = alloca i32, align 4
  %width.addr = alloca i32, align 4
  %inputImage.addr = alloca ptr, align 8
  %outputImage.addr = alloca ptr, align 8
  %sigma = alloca float, align 4
  %s = alloca float, align 4
  %offset = alloca i32, align 4
  %sum = alloca float, align 4
  %gaussianFilter = alloca [9 x [9 x float]], align 4
  %x = alloca i32, align 4
  %cleanup.dest.slot = alloca i32, align 4
  %y = alloca i32, align 4
  %sum_in_current_frame = alloca float, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %k = alloca i32, align 4
  %l = alloca i32, align 4
  store i32 %height, ptr %height.addr, align 4, !tbaa !9
  store i32 %width, ptr %width.addr, align 4, !tbaa !9
  store ptr %inputImage, ptr %inputImage.addr, align 8, !tbaa !13
  store ptr %outputImage, ptr %outputImage.addr, align 8, !tbaa !13
  %0 = load i32, ptr %height.addr, align 4, !tbaa !9
  %1 = zext i32 %0 to i64
  %2 = load i32, ptr %width.addr, align 4, !tbaa !9
  %3 = zext i32 %2 to i64
  %4 = load i32, ptr %height.addr, align 4, !tbaa !9
  %5 = zext i32 %4 to i64
  %6 = load i32, ptr %width.addr, align 4, !tbaa !9
  %7 = zext i32 %6 to i64
  call void @llvm.lifetime.start.p0(i64 4, ptr %sigma) #4
  store float 9.000000e+00, ptr %sigma, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %s) #4
  %8 = load float, ptr %sigma, align 4, !tbaa !16
  %conv = fpext float %8 to double
  %mul = fmul double 2.000000e+00, %conv
  %9 = load float, ptr %sigma, align 4, !tbaa !16
  %conv1 = fpext float %9 to double
  %mul2 = fmul double %mul, %conv1
  %conv3 = fptrunc double %mul2 to float
  store float %conv3, ptr %s, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %offset) #4
  store i32 4, ptr %offset, align 4, !tbaa !9
  call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #4
  store float 0.000000e+00, ptr %sum, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 324, ptr %gaussianFilter) #4
  call void @llvm.memset.p0.i64(ptr align 4 %gaussianFilter, i8 0, i64 324, i1 false)
  call void @llvm.lifetime.start.p0(i64 4, ptr %x) #4
  %10 = load i32, ptr %offset, align 4, !tbaa !9
  %mul4 = mul nsw i32 -1, %10
  store i32 %mul4, ptr %x, align 4, !tbaa !9
  br label %for.cond

for.cond:                                         ; preds = %for.inc31, %entry
  %11 = load i32, ptr %x, align 4, !tbaa !9
  %12 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp = icmp sle i32 %11, %12
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  store i32 2, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %x) #4
  br label %for.end33

for.body:                                         ; preds = %for.cond
  call void @llvm.lifetime.start.p0(i64 4, ptr %y) #4
  %13 = load i32, ptr %offset, align 4, !tbaa !9
  %mul6 = mul nsw i32 -1, %13
  store i32 %mul6, ptr %y, align 4, !tbaa !9
  br label %for.cond7

for.cond7:                                        ; preds = %for.inc, %for.body
  %14 = load i32, ptr %y, align 4, !tbaa !9
  %15 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp8 = icmp sle i32 %14, %15
  br i1 %cmp8, label %for.body11, label %for.cond.cleanup10

for.cond.cleanup10:                               ; preds = %for.cond7
  store i32 5, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %y) #4
  br label %for.end

for.body11:                                       ; preds = %for.cond7
  %16 = load i32, ptr %x, align 4, !tbaa !9
  %17 = load i32, ptr %x, align 4, !tbaa !9
  %mul12 = mul nsw i32 %16, %17
  %18 = load i32, ptr %y, align 4, !tbaa !9
  %19 = load i32, ptr %y, align 4, !tbaa !9
  %mul13 = mul nsw i32 %18, %19
  %add = add nsw i32 %mul12, %mul13
  %sub = sub nsw i32 0, %add
  %conv14 = sitofp i32 %sub to float
  %20 = load float, ptr %s, align 4, !tbaa !16
  %div = fdiv float %conv14, %20
  %conv15 = fpext float %div to double
  %call = call double @exp(double noundef %conv15) #4, !tbaa !9
  %21 = load float, ptr %s, align 4, !tbaa !16
  %conv16 = fpext float %21 to double
  %mul17 = fmul double 0x400921FB54442D18, %conv16
  %div18 = fdiv double %call, %mul17
  %conv19 = fptrunc double %div18 to float
  %22 = load i32, ptr %x, align 4, !tbaa !9
  %23 = load i32, ptr %offset, align 4, !tbaa !9
  %add20 = add nsw i32 %22, %23
  %idxprom = sext i32 %add20 to i64
  %arrayidx = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom
  %24 = load i32, ptr %y, align 4, !tbaa !9
  %25 = load i32, ptr %offset, align 4, !tbaa !9
  %add21 = add nsw i32 %24, %25
  %idxprom22 = sext i32 %add21 to i64
  %arrayidx23 = getelementptr inbounds [9 x float], ptr %arrayidx, i64 0, i64 %idxprom22
  store float %conv19, ptr %arrayidx23, align 4, !tbaa !16
  %26 = load i32, ptr %x, align 4, !tbaa !9
  %27 = load i32, ptr %offset, align 4, !tbaa !9
  %add24 = add nsw i32 %26, %27
  %idxprom25 = sext i32 %add24 to i64
  %arrayidx26 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom25
  %28 = load i32, ptr %y, align 4, !tbaa !9
  %29 = load i32, ptr %offset, align 4, !tbaa !9
  %add27 = add nsw i32 %28, %29
  %idxprom28 = sext i32 %add27 to i64
  %arrayidx29 = getelementptr inbounds [9 x float], ptr %arrayidx26, i64 0, i64 %idxprom28
  %30 = load float, ptr %arrayidx29, align 4, !tbaa !16
  %31 = load float, ptr %sum, align 4, !tbaa !16
  %add30 = fadd float %31, %30
  store float %add30, ptr %sum, align 4, !tbaa !16
  br label %for.inc

for.inc:                                          ; preds = %for.body11
  %32 = load i32, ptr %y, align 4, !tbaa !9
  %inc = add nsw i32 %32, 1
  store i32 %inc, ptr %y, align 4, !tbaa !9
  br label %for.cond7, !llvm.loop !18

for.end:                                          ; preds = %for.cond.cleanup10
  br label %for.inc31

for.inc31:                                        ; preds = %for.end
  %33 = load i32, ptr %x, align 4, !tbaa !9
  %inc32 = add nsw i32 %33, 1
  store i32 %inc32, ptr %x, align 4, !tbaa !9
  br label %for.cond, !llvm.loop !20

for.end33:                                        ; preds = %for.cond.cleanup
  call void @llvm.lifetime.start.p0(i64 4, ptr %sum_in_current_frame) #4
  store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
  %34 = load i32, ptr %offset, align 4, !tbaa !9
  store i32 %34, ptr %i, align 4, !tbaa !9
  br label %for.cond34

for.cond34:                                       ; preds = %for.inc88, %for.end33
  %35 = load i32, ptr %i, align 4, !tbaa !9
  %36 = load i32, ptr %height.addr, align 4, !tbaa !9
  %37 = load i32, ptr %offset, align 4, !tbaa !9
  %sub35 = sub nsw i32 %36, %37
  %cmp36 = icmp slt i32 %35, %sub35
  br i1 %cmp36, label %for.body39, label %for.cond.cleanup38

for.cond.cleanup38:                               ; preds = %for.cond34
  store i32 8, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
  br label %for.end90

for.body39:                                       ; preds = %for.cond34
  call void @llvm.lifetime.start.p0(i64 4, ptr %j) #4
  %38 = load i32, ptr %offset, align 4, !tbaa !9
  store i32 %38, ptr %j, align 4, !tbaa !9
  br label %for.cond40

for.cond40:                                       ; preds = %for.inc85, %for.body39
  %39 = load i32, ptr %j, align 4, !tbaa !9
  %40 = load i32, ptr %width.addr, align 4, !tbaa !9
  %41 = load i32, ptr %offset, align 4, !tbaa !9
  %sub41 = sub nsw i32 %40, %41
  %cmp42 = icmp slt i32 %39, %sub41
  br i1 %cmp42, label %for.body45, label %for.cond.cleanup44

for.cond.cleanup44:                               ; preds = %for.cond40
  store i32 11, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %j) #4
  br label %for.end87

for.body45:                                       ; preds = %for.cond40
  store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %k) #4
  %42 = load i32, ptr %offset, align 4, !tbaa !9
  %mul46 = mul nsw i32 -1, %42
  store i32 %mul46, ptr %k, align 4, !tbaa !9
  br label %for.cond47

for.cond47:                                       ; preds = %for.inc76, %for.body45
  %43 = load i32, ptr %k, align 4, !tbaa !9
  %44 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp48 = icmp sle i32 %43, %44
  br i1 %cmp48, label %for.body51, label %for.cond.cleanup50

for.cond.cleanup50:                               ; preds = %for.cond47
  store i32 14, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %k) #4
  br label %for.end78

for.body51:                                       ; preds = %for.cond47
  call void @llvm.lifetime.start.p0(i64 4, ptr %l) #4
  %45 = load i32, ptr %offset, align 4, !tbaa !9
  %mul52 = mul nsw i32 -1, %45
  store i32 %mul52, ptr %l, align 4, !tbaa !9
  br label %for.cond53

for.cond53:                                       ; preds = %for.inc73, %for.body51
  %46 = load i32, ptr %l, align 4, !tbaa !9
  %47 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp54 = icmp sle i32 %46, %47
  br i1 %cmp54, label %for.body57, label %for.cond.cleanup56

for.cond.cleanup56:                               ; preds = %for.cond53
  store i32 17, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %l) #4
  br label %for.end75

for.body57:                                       ; preds = %for.cond53
  %48 = load ptr, ptr %inputImage.addr, align 8, !tbaa !13
  %49 = load i32, ptr %i, align 4, !tbaa !9
  %50 = load i32, ptr %k, align 4, !tbaa !9
  %add58 = add nsw i32 %49, %50
  %idxprom59 = sext i32 %add58 to i64
  %51 = mul nsw i64 %idxprom59, %3
  %arrayidx60 = getelementptr inbounds i32, ptr %48, i64 %51
  %52 = load i32, ptr %j, align 4, !tbaa !9
  %53 = load i32, ptr %l, align 4, !tbaa !9
  %add61 = add nsw i32 %52, %53
  %idxprom62 = sext i32 %add61 to i64
  %arrayidx63 = getelementptr inbounds i32, ptr %arrayidx60, i64 %idxprom62
  %54 = load i32, ptr %arrayidx63, align 4, !tbaa !9
  %conv64 = sitofp i32 %54 to float
  %55 = load i32, ptr %k, align 4, !tbaa !9
  %56 = load i32, ptr %offset, align 4, !tbaa !9
  %add65 = add nsw i32 %55, %56
  %idxprom66 = sext i32 %add65 to i64
  %arrayidx67 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom66
  %57 = load i32, ptr %l, align 4, !tbaa !9
  %58 = load i32, ptr %offset, align 4, !tbaa !9
  %add68 = add nsw i32 %57, %58
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds [9 x float], ptr %arrayidx67, i64 0, i64 %idxprom69
  %59 = load float, ptr %arrayidx70, align 4, !tbaa !16
  %mul71 = fmul float %conv64, %59
  %60 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
  %add72 = fadd float %60, %mul71
  store float %add72, ptr %sum_in_current_frame, align 4, !tbaa !16
  br label %for.inc73

for.inc73:                                        ; preds = %for.body57
  %61 = load i32, ptr %l, align 4, !tbaa !9
  %inc74 = add nsw i32 %61, 1
  store i32 %inc74, ptr %l, align 4, !tbaa !9
  br label %for.cond53, !llvm.loop !21

for.end75:                                        ; preds = %for.cond.cleanup56
  br label %for.inc76

for.inc76:                                        ; preds = %for.end75
  %62 = load i32, ptr %k, align 4, !tbaa !9
  %inc77 = add nsw i32 %62, 1
  store i32 %inc77, ptr %k, align 4, !tbaa !9
  br label %for.cond47, !llvm.loop !22

for.end78:                                        ; preds = %for.cond.cleanup50
  %63 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
  %64 = load float, ptr %sum, align 4, !tbaa !16
  %div79 = fdiv float %63, %64
  %conv80 = fptosi float %div79 to i32
  %65 = load ptr, ptr %outputImage.addr, align 8, !tbaa !13
  %66 = load i32, ptr %i, align 4, !tbaa !9
  %idxprom81 = sext i32 %66 to i64
  %67 = mul nsw i64 %idxprom81, %7
  %arrayidx82 = getelementptr inbounds i32, ptr %65, i64 %67
  %68 = load i32, ptr %j, align 4, !tbaa !9
  %idxprom83 = sext i32 %68 to i64
  %arrayidx84 = getelementptr inbounds i32, ptr %arrayidx82, i64 %idxprom83
  store i32 %conv80, ptr %arrayidx84, align 4, !tbaa !9
  br label %for.inc85

for.inc85:                                        ; preds = %for.end78
  %69 = load i32, ptr %j, align 4, !tbaa !9
  %inc86 = add nsw i32 %69, 1
  store i32 %inc86, ptr %j, align 4, !tbaa !9
  br label %for.cond40, !llvm.loop !23

for.end87:                                        ; preds = %for.cond.cleanup44
  br label %for.inc88

for.inc88:                                        ; preds = %for.end87
  %70 = load i32, ptr %i, align 4, !tbaa !9
  %inc89 = add nsw i32 %70, 1
  store i32 %inc89, ptr %i, align 4, !tbaa !9
  br label %for.cond34, !llvm.loop !24

for.end90:                                        ; preds = %for.cond.cleanup38
  call void @llvm.lifetime.end.p0(i64 4, ptr %sum_in_current_frame) #4
  call void @llvm.lifetime.end.p0(i64 324, ptr %gaussianFilter) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %offset) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %s) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %sigma) #4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none)) #1

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #2

; Function Attrs: nounwind
declare double @exp(double noundef) #3

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none)) #1

attributes #0 = { nounwind uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #3 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 0}
!8 = !{!"clang version 21.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !11, i64 0}
!11 = !{!"omnipotent char", !12, i64 0}
!12 = !{!"Simple C/C++ TBAA"}
!13 = !{!14, !14, i64 0}
!14 = !{!"p1 int", !15, i64 0}
!15 = !{!"any pointer", !11, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"float", !11, i64 0}
!18 = distinct !{!18, !19}
!19 = !{!"llvm.loop.mustprogress"}
!20 = distinct !{!20, !19}
!21 = distinct !{!21, !19}
!22 = distinct !{!22, !19}
!23 = distinct !{!23, !19}
!24 = distinct !{!24, !19}

And build with:

clang --target=riscv64-linux-gnu -march=rva23u64 -O3 -ffp-contract=off tc.ll -c -S -o -

Looking at the dump of -mllvm -debug -mllvm -print-after-all for LLVM with/without this patch, the amount of logic dropped by SLP after this patch seems suspect (the diff below is 'good' vs 'bad'):

 ; *** IR Dump After SLPVectorizerPass on gaussianBlurKernel ***
 ; Function Attrs: nofree norecurse nounwind memory(argmem: readwrite, errnomem: write) uwtable vscale_range(2,1024)
 define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef readonly captures(none) %inputImage, ptr noundef writeonly captures(none) %outputImage) local_unnamed_addr #0 {
 entry:
   %gaussianFilter = alloca [9 x [9 x float]], align 4
-  call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #4
+  call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #5
   call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(324) %gaussianFilter, i8 0, i64 324, i1 false)
   br label %for.cond7.preheader
 
@@ -41577,7 +40757,7 @@
   %conv14 = sitofp i32 %3 to float
   %div = fdiv float %conv14, 1.620000e+02
   %conv15 = fpext float %div to double
-  %call = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+  %call = tail call double @exp(double noundef %conv15) #5, !tbaa !9
   %div18 = fdiv double %call, 0x407FCF0216A64912
   %conv19 = fptrunc double %div18 to float
   %arrayidx23 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 0
@@ -41588,7 +40768,7 @@
   %conv14.1 = sitofp i32 %5 to float
   %div.1 = fdiv float %conv14.1, 1.620000e+02
   %conv15.1 = fpext float %div.1 to double
-  %call.1 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+  %call.1 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
   %div18.1 = fdiv double %call.1, 0x407FCF0216A64912
   %conv19.1 = fptrunc double %div18.1 to float
   %arrayidx23.1 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 1
@@ -41599,7 +40779,7 @@
   %conv14.2 = sitofp i32 %7 to float
   %div.2 = fdiv float %conv14.2, 1.620000e+02
   %conv15.2 = fpext float %div.2 to double
-  %call.2 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+  %call.2 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
   %div18.2 = fdiv double %call.2, 0x407FCF0216A64912
   %conv19.2 = fptrunc double %div18.2 to float
   %arrayidx23.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 2
@@ -41610,7 +40790,7 @@
   %conv14.3 = sitofp i32 %9 to float
   %div.3 = fdiv float %conv14.3, 1.620000e+02
   %conv15.3 = fpext float %div.3 to double
-  %call.3 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+  %call.3 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
   %div18.3 = fdiv double %call.3, 0x407FCF0216A64912
   %conv19.3 = fptrunc double %div18.3 to float
   %arrayidx23.3 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 3
@@ -41621,31 +40801,31 @@
   %conv14.4 = sitofp i32 %11 to float
   %div.4 = fdiv float %conv14.4, 1.620000e+02
   %conv15.4 = fpext float %div.4 to double
-  %call.4 = tail call double @exp(double noundef %conv15.4) #4, !tbaa !9
+  %call.4 = tail call double @exp(double noundef %conv15.4) #5, !tbaa !9
   %div18.4 = fdiv double %call.4, 0x407FCF0216A64912
   %conv19.4 = fptrunc double %div18.4 to float
   %arrayidx23.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 4
   store float %conv19.4, ptr %arrayidx23.4, align 4, !tbaa !13
   %add30.4 = fadd float %add30.3, %conv19.4
-  %call.5 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+  %call.5 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
   %div18.5 = fdiv double %call.5, 0x407FCF0216A64912
   %conv19.5 = fptrunc double %div18.5 to float
   %arrayidx23.5 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 5
   store float %conv19.5, ptr %arrayidx23.5, align 4, !tbaa !13
   %add30.5 = fadd float %add30.4, %conv19.5
-  %call.6 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+  %call.6 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
   %div18.6 = fdiv double %call.6, 0x407FCF0216A64912
   %conv19.6 = fptrunc double %div18.6 to float
   %arrayidx23.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 6
   store float %conv19.6, ptr %arrayidx23.6, align 4, !tbaa !13
   %add30.6 = fadd float %add30.5, %conv19.6
-  %call.7 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+  %call.7 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
   %div18.7 = fdiv double %call.7, 0x407FCF0216A64912
   %conv19.7 = fptrunc double %div18.7 to float
   %arrayidx23.7 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 7
   store float %conv19.7, ptr %arrayidx23.7, align 4, !tbaa !13
   %add30.7 = fadd float %add30.6, %conv19.7
-  %call.8 = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+  %call.8 = tail call double @exp(double noundef %conv15) #5, !tbaa !9
   %div18.8 = fdiv double %call.8, 0x407FCF0216A64912
   %conv19.8 = fptrunc double %div18.8 to float
   %arrayidx23.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 8
@@ -41680,74 +40860,38 @@
   %conv80.us = fptosi float %div79.us to i32
   %arrayidx84.us = getelementptr inbounds nuw i32, ptr %arrayidx82.us, i64 %indvars.iv167
   store i32 %conv80.us, ptr %arrayidx84.us, align 4, !tbaa !9
-  %exitcond169.not = icmp eq i64 %47, %wide.trip.count
+  %exitcond169.not = icmp eq i64 %26, %wide.trip.count
   br i1 %exitcond169.not, label %for.cond40.for.cond.cleanup44_crit_edge.us, label %for.cond47.preheader.us, !llvm.loop !17
 
 for.cond53.preheader.us:                          ; preds = %for.cond47.preheader.us, %for.cond53.preheader.us
   %indvars.iv162 = phi i64 [ -4, %for.cond47.preheader.us ], [ %indvars.iv.next163, %for.cond53.preheader.us ]
-  %sum_in_current_frame.0144.us = phi float [ 0.000000e+00, %for.cond47.preheader.us ], [ %add72.us.8, %for.cond53.preheader.us ]
   %14 = add nsw i64 %indvars.iv162, %indvars.iv170
   %15 = mul nuw nsw i64 %14, %12
   %arrayidx60.us = getelementptr inbounds i32, ptr %inputImage, i64 %15
   %16 = add nsw i64 %indvars.iv162, 4
-  %17 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us = getelementptr i8, ptr %17, i64 -16
-  %arrayidx70.us = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 0
-  %18 = load <2 x i32>, ptr %arrayidx63.us, align 4, !tbaa !9
+  %17 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+  %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %17, i64 8
+  %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
+  %18 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
   %19 = sitofp <2 x i32> %18 to <2 x float>
-  %20 = load <2 x float>, ptr %arrayidx70.us, align 4, !tbaa !13
+  %20 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
   %21 = fmul <2 x float> %20, %19
-  %22 = extractelement <2 x float> %21, i32 0
-  %add72.us = fadd float %sum_in_current_frame.0144.us, %22
-  %23 = extractelement <2 x float> %21, i32 1
-  %add72.us.1 = fadd float %add72.us, %23
-  %24 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.2 = getelementptr i8, ptr %24, i64 -8
-  %arrayidx70.us.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 2
-  %25 = load <2 x i32>, ptr %arrayidx63.us.2, align 4, !tbaa !9
-  %26 = sitofp <2 x i32> %25 to <2 x float>
-  %27 = load <2 x float>, ptr %arrayidx70.us.2, align 4, !tbaa !13
-  %28 = fmul <2 x float> %27, %26
-  %29 = extractelement <2 x float> %28, i32 0
-  %add72.us.2 = fadd float %add72.us.1, %29
-  %30 = extractelement <2 x float> %28, i32 1
-  %add72.us.3 = fadd float %add72.us.2, %30
-  %arrayidx63.us.4 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx70.us.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 4
-  %31 = load <2 x i32>, ptr %arrayidx63.us.4, align 4, !tbaa !9
-  %32 = sitofp <2 x i32> %31 to <2 x float>
-  %33 = load <2 x float>, ptr %arrayidx70.us.4, align 4, !tbaa !13
-  %34 = fmul <2 x float> %33, %32
-  %35 = extractelement <2 x float> %34, i32 0
-  %add72.us.4 = fadd float %add72.us.3, %35
-  %36 = extractelement <2 x float> %34, i32 1
-  %add72.us.5 = fadd float %add72.us.4, %36
-  %37 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %37, i64 8
-  %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
-  %38 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
-  %39 = sitofp <2 x i32> %38 to <2 x float>
-  %40 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
-  %41 = fmul <2 x float> %40, %39
-  %42 = extractelement <2 x float> %41, i32 0
-  %add72.us.6 = fadd float %add72.us.5, %42
-  %43 = extractelement <2 x float> %41, i32 1
-  %add72.us.7 = fadd float %add72.us.6, %43
-  %44 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %44, i64 16
-  %45 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
-  %conv64.us.8 = sitofp i32 %45 to float
+  %22 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %21)
+  %23 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+  %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %23, i64 16
+  %24 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
+  %conv64.us.8 = sitofp i32 %24 to float
   %arrayidx70.us.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 8
-  %46 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
-  %mul71.us.8 = fmul float %46, %conv64.us.8
-  %add72.us.8 = fadd float %add72.us.7, %mul71.us.8
+  %25 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
+  %mul71.us.8 = fmul float %25, %conv64.us.8
+  %add72.us.8 = fadd float %22, %mul71.us.8
   %indvars.iv.next163 = add nsw i64 %indvars.iv162, 1
   %exitcond166.not = icmp eq i64 %indvars.iv.next163, 5
   br i1 %exitcond166.not, label %for.cond.cleanup50.us, label %for.cond53.preheader.us, !llvm.loop !18
 
 for.cond47.preheader.us:                          ; preds = %for.cond40.preheader.us, %for.cond.cleanup50.us
-  %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %47, %for.cond.cleanup50.us ]
-  %47 = add nuw nsw i64 %indvars.iv167, 1
+  %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %26, %for.cond.cleanup50.us ]
+  %26 = add nuw nsw i64 %indvars.iv167, 1
   br label %for.cond53.preheader.us
 
 for.cond40.for.cond.cleanup44_crit_edge.us:       ; preds = %for.cond.cleanup50.us
@@ -41756,14 +40900,14 @@
   br i1 %exitcond173.not, label %for.cond.cleanup38, label %for.cond40.preheader.us, !llvm.loop !19
 
 for.cond.cleanup38:                               ; preds = %for.cond40.for.cond.cleanup44_crit_edge.us, %for.cond34.preheader
-  call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #4
+  call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #5
   ret void
 }

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Jul 10, 2025
…vectorizing operands (llvm#147583)"

breaks spec accel2023  404 456 457 470

This reverts commit ac4a38e.
alexey-bataev added a commit that referenced this pull request Jul 10, 2025
…ing operands (#147583)

Added emission of the 2-element reduction instead of 2 extracts + scalar
op, when trying to vectorize operands of the instruction, if it is more
profitable.
@preames
Collaborator

preames commented Jul 10, 2025

@alexey-bataev I see that you relanded this change, but don't see any description here or in the commit as to what changed. Can you briefly describe?

@alexey-bataev
Member Author

@alexey-bataev I see that you relanded this change, but don't see any description here or in the commit as to what changed. Can you briefly describe?

Prevented the emission for the case when it tried to emit a reduction for non-immediate operands.
I.e., if you have something like (a+b) + (c+d) and the vectorizer prefers to vectorize the pair (a,c), it should not be emitted as a reduction; we still need to emit extractelements.
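
A minimal IR sketch of that scenario (hypothetical function and value names, not taken from the failing benchmarks):

; %top computes (a+b) + (c+d). If SLP prefers the candidate pair (%a, %c)
; over the immediate operands %ab and %cd, replacing %top with a reduction
; of a <2 x float> holding (a, c) would compute a+c and silently drop
; %b and %d, so the vectorized pair must be consumed via extractelements.
define float @sketch(float %a, float %b, float %c, float %d) {
  %ab = fadd fast float %a, %b
  %cd = fadd fast float %c, %d
  %top = fadd fast float %ab, %cd
  ret float %top
}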

davemgreen added a commit that referenced this pull request Jul 11, 2025
This adds a custom lowering for v2f16 vecreduce.fadd to scalarize as
opposed to padding with zeroes. This allows it to generate the more
efficient faddp. Helps with #147583.
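
Concretely, the affected node is the v2f16 reduction that the FULLFP16 test above now produces, e.g.:

; With the custom lowering, a fast v2f16 vecreduce.fadd is scalarized so it
; can be selected as an faddp, instead of being padded with zeroes.
define half @reduce_fast_half2(<2 x half> %vec2) {
  %r = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> %vec2)
  ret half %r
}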