[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes: Added emission of a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, if it is more profitable.

Patch is 36.41 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/147583.diff

8 Files Affected:
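Before the file-by-file diff, a minimal sketch of the pattern change (the same shape appears in the updated CHECK lines in the tests below; %v and the value names are illustrative):

; Before: the vectorized operands are glued back together with two extracts
; and a scalar op.
%e0 = extractelement <2 x float> %v, i32 0
%e1 = extractelement <2 x float> %v, i32 1
%add = fadd fast float %e0, %e1

; After: a single 2-element reduction, emitted when the cost model says it
; is cheaper than the extracts plus the scalar op.
%add.red = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %v)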
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..bec393051a257 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21676,58 +21676,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return Changed;
}
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
- if (!I)
- return false;
-
- if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
- return false;
-
- Value *P = I->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
- R.isDeleted(Op0) || R.isDeleted(Op1))
- return false;
-
- // First collect all possible candidates
- SmallVector<std::pair<Value *, Value *>, 4> Candidates;
- Candidates.emplace_back(Op0, Op1);
-
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
- // Try to skip B.
- if (A && B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && !R.isDeleted(B0))
- Candidates.emplace_back(A, B0);
- if (B1 && B1->getParent() == P && !R.isDeleted(B1))
- Candidates.emplace_back(A, B1);
- }
- // Try to skip A.
- if (B && A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && !R.isDeleted(A0))
- Candidates.emplace_back(A0, B);
- if (A1 && A1->getParent() == P && !R.isDeleted(A1))
- Candidates.emplace_back(A1, B);
- }
-
- if (Candidates.size() == 1)
- return tryToVectorizeList({Op0, Op1}, R);
-
- // We have multiple options. Try to pick the single best.
- std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
- if (!BestCandidate)
- return false;
- return tryToVectorizeList(
- {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}
-
namespace {
/// Model horizontal reductions.
@@ -21770,6 +21718,8 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// The minimum number of the reduced values.
+ const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
/// Contains vector values for reduction including their scale factor and
/// signedness.
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -22068,6 +22018,24 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
+ HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+ : ReductionRoot(I), ReductionLimit(2) {
+ RdxKind = HorizontalReduction::getRdxKind(I);
+ ReductionOps.emplace_back().push_back(I);
+ ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+ for (Value *V : Ops)
+ ReducedValsToOps[V].push_back(I);
+ }
+
+ bool matchReductionForOperands() const {
+ // Analyze "regular" integer/FP types for reductions - no target-specific
+ // types or pointers.
+ assert(ReductionRoot && "Reduction root is not set!");
+ if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot)))
+ return false;
+
+ return true;
+ }
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22235,7 +22203,6 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
- const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23740,6 +23707,102 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+ R.isDeleted(Op0) || R.isDeleted(Op1))
+ return false;
+
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (A && B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && !R.isDeleted(B0))
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P && !R.isDeleted(B1))
+ Candidates.emplace_back(A, B1);
+ }
+ // Try to skip A.
+ if (B && A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && !R.isDeleted(A0))
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P && !R.isDeleted(A1))
+ Candidates.emplace_back(A1, B);
+ }
+
+  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
+                                             ArrayRef<Value *> Ops) {
+ if (!isReductionCandidate(Inst))
+ return false;
+ Type *Ty = Inst->getType();
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
+ return false;
+ HorizontalReduction HorRdx(Inst, Ops);
+ if (!HorRdx.matchReductionForOperands())
+ return false;
+ // Check the cost of operations.
+    VectorType *VecTy = getWidenedType(Ty, Ops.size());
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost ScalarCost =
+ TTI.getScalarizationOverhead(
+ VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+ /*Extract=*/true, CostKind) +
+ TTI.getInstructionCost(Inst, CostKind);
+ InstructionCost RedCost;
+ switch (::getRdxKind(Inst)) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul: {
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+ FMF = FPCI->getFastMathFlags();
+ RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+ CostKind);
+ break;
+ }
+ default:
+ return false;
+ }
+ if (RedCost >= ScalarCost)
+ return false;
+
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ };
+ if (Candidates.size() == 1)
+ return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
+
+ // We have multiple options. Try to pick the single best.
+ std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return TryToReduce(I, {Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second}) ||
+ tryToVectorizeList({Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second},
+ R);
+}
+
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<WeakTrackingVH> PostponedInsts;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT: ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT: ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT: [[ENTRY:.*:]]
+; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
define half @reduce_half2(<2 x half> %vec2) {
; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
; CHECK-LABEL: define float @reduce_fast_float2(
; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
; CHECK-NEXT: ret float [[ADD1]]
;
entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
; CHECK-LABEL: define double @reduce_fast_double2(
; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
; CHECK-NEXT: ret double [[ADD1]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
-; POWEROF2: 7:
+; POWEROF2: 6:
; POWEROF2-NEXT: br label [[TMP8]]
-; POWEROF2: 8:
-; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2: 7:
+; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
; POWEROF2-NEXT: br label [[TMP11:%.*]]
-; POWEROF2: 11:
+; POWEROF2: 9:
; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
; POWEROF2-NEXT: ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Created using spr 1.3.5
I'm supportive of the direction, and the RISCV changes look fine.
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
Please can you move this as an NFC and then rebase? I can't tell if there are any changes to the implementation.
LGTM with a couple of minors.
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
Technically corei7 is not AVX; it's an SSE4.2 CPU, but its costs are close enough that it's fine to keep it on CHECK-AVX for now.
I can rename to SSE/NONSSE instead
Thanks, but no need; I might alter the tested CPUs in a future patch though.
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
Can't the integer min/max kinds be included here?
We do not vectorize those here for now, just binary operations and compares.
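For context, a minimal sketch (my illustration, with hypothetical values; not from the patch): a 2-way integer min is not a single BinaryOperator in IR, which is why it falls outside this entry point for now.

; Scalar form of a 2-way smin: a compare plus a select,
%c = icmp slt i32 %a, %b
%m = select i1 %c, i32 %a, i32 %b
; whereas the reduction form would be a dedicated intrinsic:
; %m.red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)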
Created using spr 1.3.5
This patch seems to be breaking 4 SPEC Accel2023 tests: 404, 456, 457, and 470.
…vectorizing operands (#147583)" This reverts commit ac4a38e. This breaks the RVV builders (MicroBenchmarks/ImageProcessing/Blur/blur.test and MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite) and reportedly SPEC Accel2023 <#147583 (comment)>.
I've pushed a revert for the time being because this breaks the RVV builders too (MicroBenchmarks/ImageProcessing/Blur/blur.test and MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite).
…r op, when vectorizing operands (#147583)" This reverts commit ac4a38e. This breaks the RVV builders (MicroBenchmarks/ImageProcessing/Blur/blur.test and MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite) and reportedly SPEC Accel2023 <llvm/llvm-project#147583 (comment)>.
The changed translation unit from the Blur microbenchmark is quite small, so isolating that as tc.ll:

; ModuleID = 'gaussianBlurKernel.bc'
source_filename = "./MicroBenchmarks/ImageProcessing/Blur/gaussianBlurKernel.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable vscale_range(2,1024)
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef %inputImage, ptr noundef %outputImage) #0 {
entry:
%height.addr = alloca i32, align 4
%width.addr = alloca i32, align 4
%inputImage.addr = alloca ptr, align 8
%outputImage.addr = alloca ptr, align 8
%sigma = alloca float, align 4
%s = alloca float, align 4
%offset = alloca i32, align 4
%sum = alloca float, align 4
%gaussianFilter = alloca [9 x [9 x float]], align 4
%x = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
%y = alloca i32, align 4
%sum_in_current_frame = alloca float, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%k = alloca i32, align 4
%l = alloca i32, align 4
store i32 %height, ptr %height.addr, align 4, !tbaa !9
store i32 %width, ptr %width.addr, align 4, !tbaa !9
store ptr %inputImage, ptr %inputImage.addr, align 8, !tbaa !13
store ptr %outputImage, ptr %outputImage.addr, align 8, !tbaa !13
%0 = load i32, ptr %height.addr, align 4, !tbaa !9
%1 = zext i32 %0 to i64
%2 = load i32, ptr %width.addr, align 4, !tbaa !9
%3 = zext i32 %2 to i64
%4 = load i32, ptr %height.addr, align 4, !tbaa !9
%5 = zext i32 %4 to i64
%6 = load i32, ptr %width.addr, align 4, !tbaa !9
%7 = zext i32 %6 to i64
call void @llvm.lifetime.start.p0(i64 4, ptr %sigma) #4
store float 9.000000e+00, ptr %sigma, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %s) #4
%8 = load float, ptr %sigma, align 4, !tbaa !16
%conv = fpext float %8 to double
%mul = fmul double 2.000000e+00, %conv
%9 = load float, ptr %sigma, align 4, !tbaa !16
%conv1 = fpext float %9 to double
%mul2 = fmul double %mul, %conv1
%conv3 = fptrunc double %mul2 to float
store float %conv3, ptr %s, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %offset) #4
store i32 4, ptr %offset, align 4, !tbaa !9
call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #4
store float 0.000000e+00, ptr %sum, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 324, ptr %gaussianFilter) #4
call void @llvm.memset.p0.i64(ptr align 4 %gaussianFilter, i8 0, i64 324, i1 false)
call void @llvm.lifetime.start.p0(i64 4, ptr %x) #4
%10 = load i32, ptr %offset, align 4, !tbaa !9
%mul4 = mul nsw i32 -1, %10
store i32 %mul4, ptr %x, align 4, !tbaa !9
br label %for.cond
for.cond: ; preds = %for.inc31, %entry
%11 = load i32, ptr %x, align 4, !tbaa !9
%12 = load i32, ptr %offset, align 4, !tbaa !9
%cmp = icmp sle i32 %11, %12
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
store i32 2, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %x) #4
br label %for.end33
for.body: ; preds = %for.cond
call void @llvm.lifetime.start.p0(i64 4, ptr %y) #4
%13 = load i32, ptr %offset, align 4, !tbaa !9
%mul6 = mul nsw i32 -1, %13
store i32 %mul6, ptr %y, align 4, !tbaa !9
br label %for.cond7
for.cond7: ; preds = %for.inc, %for.body
%14 = load i32, ptr %y, align 4, !tbaa !9
%15 = load i32, ptr %offset, align 4, !tbaa !9
%cmp8 = icmp sle i32 %14, %15
br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
for.cond.cleanup10: ; preds = %for.cond7
store i32 5, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %y) #4
br label %for.end
for.body11: ; preds = %for.cond7
%16 = load i32, ptr %x, align 4, !tbaa !9
%17 = load i32, ptr %x, align 4, !tbaa !9
%mul12 = mul nsw i32 %16, %17
%18 = load i32, ptr %y, align 4, !tbaa !9
%19 = load i32, ptr %y, align 4, !tbaa !9
%mul13 = mul nsw i32 %18, %19
%add = add nsw i32 %mul12, %mul13
%sub = sub nsw i32 0, %add
%conv14 = sitofp i32 %sub to float
%20 = load float, ptr %s, align 4, !tbaa !16
%div = fdiv float %conv14, %20
%conv15 = fpext float %div to double
%call = call double @exp(double noundef %conv15) #4, !tbaa !9
%21 = load float, ptr %s, align 4, !tbaa !16
%conv16 = fpext float %21 to double
%mul17 = fmul double 0x400921FB54442D18, %conv16
%div18 = fdiv double %call, %mul17
%conv19 = fptrunc double %div18 to float
%22 = load i32, ptr %x, align 4, !tbaa !9
%23 = load i32, ptr %offset, align 4, !tbaa !9
%add20 = add nsw i32 %22, %23
%idxprom = sext i32 %add20 to i64
%arrayidx = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom
%24 = load i32, ptr %y, align 4, !tbaa !9
%25 = load i32, ptr %offset, align 4, !tbaa !9
%add21 = add nsw i32 %24, %25
%idxprom22 = sext i32 %add21 to i64
%arrayidx23 = getelementptr inbounds [9 x float], ptr %arrayidx, i64 0, i64 %idxprom22
store float %conv19, ptr %arrayidx23, align 4, !tbaa !16
%26 = load i32, ptr %x, align 4, !tbaa !9
%27 = load i32, ptr %offset, align 4, !tbaa !9
%add24 = add nsw i32 %26, %27
%idxprom25 = sext i32 %add24 to i64
%arrayidx26 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom25
%28 = load i32, ptr %y, align 4, !tbaa !9
%29 = load i32, ptr %offset, align 4, !tbaa !9
%add27 = add nsw i32 %28, %29
%idxprom28 = sext i32 %add27 to i64
%arrayidx29 = getelementptr inbounds [9 x float], ptr %arrayidx26, i64 0, i64 %idxprom28
%30 = load float, ptr %arrayidx29, align 4, !tbaa !16
%31 = load float, ptr %sum, align 4, !tbaa !16
%add30 = fadd float %31, %30
store float %add30, ptr %sum, align 4, !tbaa !16
br label %for.inc
for.inc: ; preds = %for.body11
%32 = load i32, ptr %y, align 4, !tbaa !9
%inc = add nsw i32 %32, 1
store i32 %inc, ptr %y, align 4, !tbaa !9
br label %for.cond7, !llvm.loop !18
for.end: ; preds = %for.cond.cleanup10
br label %for.inc31
for.inc31: ; preds = %for.end
%33 = load i32, ptr %x, align 4, !tbaa !9
%inc32 = add nsw i32 %33, 1
store i32 %inc32, ptr %x, align 4, !tbaa !9
br label %for.cond, !llvm.loop !20
for.end33: ; preds = %for.cond.cleanup
call void @llvm.lifetime.start.p0(i64 4, ptr %sum_in_current_frame) #4
store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
%34 = load i32, ptr %offset, align 4, !tbaa !9
store i32 %34, ptr %i, align 4, !tbaa !9
br label %for.cond34
for.cond34: ; preds = %for.inc88, %for.end33
%35 = load i32, ptr %i, align 4, !tbaa !9
%36 = load i32, ptr %height.addr, align 4, !tbaa !9
%37 = load i32, ptr %offset, align 4, !tbaa !9
%sub35 = sub nsw i32 %36, %37
%cmp36 = icmp slt i32 %35, %sub35
br i1 %cmp36, label %for.body39, label %for.cond.cleanup38
for.cond.cleanup38: ; preds = %for.cond34
store i32 8, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
br label %for.end90
for.body39: ; preds = %for.cond34
call void @llvm.lifetime.start.p0(i64 4, ptr %j) #4
%38 = load i32, ptr %offset, align 4, !tbaa !9
store i32 %38, ptr %j, align 4, !tbaa !9
br label %for.cond40
for.cond40: ; preds = %for.inc85, %for.body39
%39 = load i32, ptr %j, align 4, !tbaa !9
%40 = load i32, ptr %width.addr, align 4, !tbaa !9
%41 = load i32, ptr %offset, align 4, !tbaa !9
%sub41 = sub nsw i32 %40, %41
%cmp42 = icmp slt i32 %39, %sub41
br i1 %cmp42, label %for.body45, label %for.cond.cleanup44
for.cond.cleanup44: ; preds = %for.cond40
store i32 11, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %j) #4
br label %for.end87
for.body45: ; preds = %for.cond40
store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %k) #4
%42 = load i32, ptr %offset, align 4, !tbaa !9
%mul46 = mul nsw i32 -1, %42
store i32 %mul46, ptr %k, align 4, !tbaa !9
br label %for.cond47
for.cond47: ; preds = %for.inc76, %for.body45
%43 = load i32, ptr %k, align 4, !tbaa !9
%44 = load i32, ptr %offset, align 4, !tbaa !9
%cmp48 = icmp sle i32 %43, %44
br i1 %cmp48, label %for.body51, label %for.cond.cleanup50
for.cond.cleanup50: ; preds = %for.cond47
store i32 14, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %k) #4
br label %for.end78
for.body51: ; preds = %for.cond47
call void @llvm.lifetime.start.p0(i64 4, ptr %l) #4
%45 = load i32, ptr %offset, align 4, !tbaa !9
%mul52 = mul nsw i32 -1, %45
store i32 %mul52, ptr %l, align 4, !tbaa !9
br label %for.cond53
for.cond53: ; preds = %for.inc73, %for.body51
%46 = load i32, ptr %l, align 4, !tbaa !9
%47 = load i32, ptr %offset, align 4, !tbaa !9
%cmp54 = icmp sle i32 %46, %47
br i1 %cmp54, label %for.body57, label %for.cond.cleanup56
for.cond.cleanup56: ; preds = %for.cond53
store i32 17, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %l) #4
br label %for.end75
for.body57: ; preds = %for.cond53
%48 = load ptr, ptr %inputImage.addr, align 8, !tbaa !13
%49 = load i32, ptr %i, align 4, !tbaa !9
%50 = load i32, ptr %k, align 4, !tbaa !9
%add58 = add nsw i32 %49, %50
%idxprom59 = sext i32 %add58 to i64
%51 = mul nsw i64 %idxprom59, %3
%arrayidx60 = getelementptr inbounds i32, ptr %48, i64 %51
%52 = load i32, ptr %j, align 4, !tbaa !9
%53 = load i32, ptr %l, align 4, !tbaa !9
%add61 = add nsw i32 %52, %53
%idxprom62 = sext i32 %add61 to i64
%arrayidx63 = getelementptr inbounds i32, ptr %arrayidx60, i64 %idxprom62
%54 = load i32, ptr %arrayidx63, align 4, !tbaa !9
%conv64 = sitofp i32 %54 to float
%55 = load i32, ptr %k, align 4, !tbaa !9
%56 = load i32, ptr %offset, align 4, !tbaa !9
%add65 = add nsw i32 %55, %56
%idxprom66 = sext i32 %add65 to i64
%arrayidx67 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom66
%57 = load i32, ptr %l, align 4, !tbaa !9
%58 = load i32, ptr %offset, align 4, !tbaa !9
%add68 = add nsw i32 %57, %58
%idxprom69 = sext i32 %add68 to i64
%arrayidx70 = getelementptr inbounds [9 x float], ptr %arrayidx67, i64 0, i64 %idxprom69
%59 = load float, ptr %arrayidx70, align 4, !tbaa !16
%mul71 = fmul float %conv64, %59
%60 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
%add72 = fadd float %60, %mul71
store float %add72, ptr %sum_in_current_frame, align 4, !tbaa !16
br label %for.inc73
for.inc73: ; preds = %for.body57
%61 = load i32, ptr %l, align 4, !tbaa !9
%inc74 = add nsw i32 %61, 1
store i32 %inc74, ptr %l, align 4, !tbaa !9
br label %for.cond53, !llvm.loop !21
for.end75: ; preds = %for.cond.cleanup56
br label %for.inc76
for.inc76: ; preds = %for.end75
%62 = load i32, ptr %k, align 4, !tbaa !9
%inc77 = add nsw i32 %62, 1
store i32 %inc77, ptr %k, align 4, !tbaa !9
br label %for.cond47, !llvm.loop !22
for.end78: ; preds = %for.cond.cleanup50
%63 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
%64 = load float, ptr %sum, align 4, !tbaa !16
%div79 = fdiv float %63, %64
%conv80 = fptosi float %div79 to i32
%65 = load ptr, ptr %outputImage.addr, align 8, !tbaa !13
%66 = load i32, ptr %i, align 4, !tbaa !9
%idxprom81 = sext i32 %66 to i64
%67 = mul nsw i64 %idxprom81, %7
%arrayidx82 = getelementptr inbounds i32, ptr %65, i64 %67
%68 = load i32, ptr %j, align 4, !tbaa !9
%idxprom83 = sext i32 %68 to i64
%arrayidx84 = getelementptr inbounds i32, ptr %arrayidx82, i64 %idxprom83
store i32 %conv80, ptr %arrayidx84, align 4, !tbaa !9
br label %for.inc85
for.inc85: ; preds = %for.end78
%69 = load i32, ptr %j, align 4, !tbaa !9
%inc86 = add nsw i32 %69, 1
store i32 %inc86, ptr %j, align 4, !tbaa !9
br label %for.cond40, !llvm.loop !23
for.end87: ; preds = %for.cond.cleanup44
br label %for.inc88
for.inc88: ; preds = %for.end87
%70 = load i32, ptr %i, align 4, !tbaa !9
%inc89 = add nsw i32 %70, 1
store i32 %inc89, ptr %i, align 4, !tbaa !9
br label %for.cond34, !llvm.loop !24
for.end90: ; preds = %for.cond.cleanup38
call void @llvm.lifetime.end.p0(i64 4, ptr %sum_in_current_frame) #4
call void @llvm.lifetime.end.p0(i64 324, ptr %gaussianFilter) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %offset) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %s) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %sigma) #4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none)) #1
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #2
; Function Attrs: nounwind
declare double @exp(double noundef) #3
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none)) #1
attributes #0 = { nounwind uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #3 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 0}
!8 = !{!"clang version 21.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !11, i64 0}
!11 = !{!"omnipotent char", !12, i64 0}
!12 = !{!"Simple C/C++ TBAA"}
!13 = !{!14, !14, i64 0}
!14 = !{!"p1 int", !15, i64 0}
!15 = !{!"any pointer", !11, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"float", !11, i64 0}
!18 = distinct !{!18, !19}
!19 = !{!"llvm.loop.mustprogress"}
!20 = distinct !{!20, !19}
!21 = distinct !{!21, !19}
!22 = distinct !{!22, !19}
!23 = distinct !{!23, !19}
!24 = distinct !{!24, !19}

And build with (the exact invocation wasn't captured in this excerpt; presumably something like: opt -passes=slp-vectorizer -S tc.ll):
Looking at the dump of the IR after the SLP vectorizer, the diff in the kernel is:

; *** IR Dump After SLPVectorizerPass on gaussianBlurKernel ***
; Function Attrs: nofree norecurse nounwind memory(argmem: readwrite, errnomem: write) uwtable vscale_range(2,1024)
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef readonly captures(none) %inputImage, ptr noundef writeonly captures(none) %outputImage) local_unnamed_addr #0 {
entry:
%gaussianFilter = alloca [9 x [9 x float]], align 4
- call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #4
+ call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #5
call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(324) %gaussianFilter, i8 0, i64 324, i1 false)
br label %for.cond7.preheader
@@ -41577,7 +40757,7 @@
%conv14 = sitofp i32 %3 to float
%div = fdiv float %conv14, 1.620000e+02
%conv15 = fpext float %div to double
- %call = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+ %call = tail call double @exp(double noundef %conv15) #5, !tbaa !9
%div18 = fdiv double %call, 0x407FCF0216A64912
%conv19 = fptrunc double %div18 to float
%arrayidx23 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 0
@@ -41588,7 +40768,7 @@
%conv14.1 = sitofp i32 %5 to float
%div.1 = fdiv float %conv14.1, 1.620000e+02
%conv15.1 = fpext float %div.1 to double
- %call.1 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+ %call.1 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
%div18.1 = fdiv double %call.1, 0x407FCF0216A64912
%conv19.1 = fptrunc double %div18.1 to float
%arrayidx23.1 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 1
@@ -41599,7 +40779,7 @@
%conv14.2 = sitofp i32 %7 to float
%div.2 = fdiv float %conv14.2, 1.620000e+02
%conv15.2 = fpext float %div.2 to double
- %call.2 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+ %call.2 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
%div18.2 = fdiv double %call.2, 0x407FCF0216A64912
%conv19.2 = fptrunc double %div18.2 to float
%arrayidx23.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 2
@@ -41610,7 +40790,7 @@
%conv14.3 = sitofp i32 %9 to float
%div.3 = fdiv float %conv14.3, 1.620000e+02
%conv15.3 = fpext float %div.3 to double
- %call.3 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+ %call.3 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
%div18.3 = fdiv double %call.3, 0x407FCF0216A64912
%conv19.3 = fptrunc double %div18.3 to float
%arrayidx23.3 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 3
@@ -41621,31 +40801,31 @@
%conv14.4 = sitofp i32 %11 to float
%div.4 = fdiv float %conv14.4, 1.620000e+02
%conv15.4 = fpext float %div.4 to double
- %call.4 = tail call double @exp(double noundef %conv15.4) #4, !tbaa !9
+ %call.4 = tail call double @exp(double noundef %conv15.4) #5, !tbaa !9
%div18.4 = fdiv double %call.4, 0x407FCF0216A64912
%conv19.4 = fptrunc double %div18.4 to float
%arrayidx23.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 4
store float %conv19.4, ptr %arrayidx23.4, align 4, !tbaa !13
%add30.4 = fadd float %add30.3, %conv19.4
- %call.5 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+ %call.5 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
%div18.5 = fdiv double %call.5, 0x407FCF0216A64912
%conv19.5 = fptrunc double %div18.5 to float
%arrayidx23.5 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 5
store float %conv19.5, ptr %arrayidx23.5, align 4, !tbaa !13
%add30.5 = fadd float %add30.4, %conv19.5
- %call.6 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+ %call.6 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
%div18.6 = fdiv double %call.6, 0x407FCF0216A64912
%conv19.6 = fptrunc double %div18.6 to float
%arrayidx23.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 6
store float %conv19.6, ptr %arrayidx23.6, align 4, !tbaa !13
%add30.6 = fadd float %add30.5, %conv19.6
- %call.7 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+ %call.7 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
%div18.7 = fdiv double %call.7, 0x407FCF0216A64912
%conv19.7 = fptrunc double %div18.7 to float
%arrayidx23.7 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 7
store float %conv19.7, ptr %arrayidx23.7, align 4, !tbaa !13
%add30.7 = fadd float %add30.6, %conv19.7
- %call.8 = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+ %call.8 = tail call double @exp(double noundef %conv15) #5, !tbaa !9
%div18.8 = fdiv double %call.8, 0x407FCF0216A64912
%conv19.8 = fptrunc double %div18.8 to float
%arrayidx23.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 8
@@ -41680,74 +40860,38 @@
%conv80.us = fptosi float %div79.us to i32
%arrayidx84.us = getelementptr inbounds nuw i32, ptr %arrayidx82.us, i64 %indvars.iv167
store i32 %conv80.us, ptr %arrayidx84.us, align 4, !tbaa !9
- %exitcond169.not = icmp eq i64 %47, %wide.trip.count
+ %exitcond169.not = icmp eq i64 %26, %wide.trip.count
br i1 %exitcond169.not, label %for.cond40.for.cond.cleanup44_crit_edge.us, label %for.cond47.preheader.us, !llvm.loop !17
for.cond53.preheader.us: ; preds = %for.cond47.preheader.us, %for.cond53.preheader.us
%indvars.iv162 = phi i64 [ -4, %for.cond47.preheader.us ], [ %indvars.iv.next163, %for.cond53.preheader.us ]
- %sum_in_current_frame.0144.us = phi float [ 0.000000e+00, %for.cond47.preheader.us ], [ %add72.us.8, %for.cond53.preheader.us ]
%14 = add nsw i64 %indvars.iv162, %indvars.iv170
%15 = mul nuw nsw i64 %14, %12
%arrayidx60.us = getelementptr inbounds i32, ptr %inputImage, i64 %15
%16 = add nsw i64 %indvars.iv162, 4
- %17 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us = getelementptr i8, ptr %17, i64 -16
- %arrayidx70.us = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 0
- %18 = load <2 x i32>, ptr %arrayidx63.us, align 4, !tbaa !9
+ %17 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+ %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %17, i64 8
+ %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
+ %18 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
%19 = sitofp <2 x i32> %18 to <2 x float>
- %20 = load <2 x float>, ptr %arrayidx70.us, align 4, !tbaa !13
+ %20 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
%21 = fmul <2 x float> %20, %19
- %22 = extractelement <2 x float> %21, i32 0
- %add72.us = fadd float %sum_in_current_frame.0144.us, %22
- %23 = extractelement <2 x float> %21, i32 1
- %add72.us.1 = fadd float %add72.us, %23
- %24 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.2 = getelementptr i8, ptr %24, i64 -8
- %arrayidx70.us.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 2
- %25 = load <2 x i32>, ptr %arrayidx63.us.2, align 4, !tbaa !9
- %26 = sitofp <2 x i32> %25 to <2 x float>
- %27 = load <2 x float>, ptr %arrayidx70.us.2, align 4, !tbaa !13
- %28 = fmul <2 x float> %27, %26
- %29 = extractelement <2 x float> %28, i32 0
- %add72.us.2 = fadd float %add72.us.1, %29
- %30 = extractelement <2 x float> %28, i32 1
- %add72.us.3 = fadd float %add72.us.2, %30
- %arrayidx63.us.4 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx70.us.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 4
- %31 = load <2 x i32>, ptr %arrayidx63.us.4, align 4, !tbaa !9
- %32 = sitofp <2 x i32> %31 to <2 x float>
- %33 = load <2 x float>, ptr %arrayidx70.us.4, align 4, !tbaa !13
- %34 = fmul <2 x float> %33, %32
- %35 = extractelement <2 x float> %34, i32 0
- %add72.us.4 = fadd float %add72.us.3, %35
- %36 = extractelement <2 x float> %34, i32 1
- %add72.us.5 = fadd float %add72.us.4, %36
- %37 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %37, i64 8
- %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
- %38 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
- %39 = sitofp <2 x i32> %38 to <2 x float>
- %40 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
- %41 = fmul <2 x float> %40, %39
- %42 = extractelement <2 x float> %41, i32 0
- %add72.us.6 = fadd float %add72.us.5, %42
- %43 = extractelement <2 x float> %41, i32 1
- %add72.us.7 = fadd float %add72.us.6, %43
- %44 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %44, i64 16
- %45 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
- %conv64.us.8 = sitofp i32 %45 to float
+ %22 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %21)
+ %23 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+ %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %23, i64 16
+ %24 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
+ %conv64.us.8 = sitofp i32 %24 to float
%arrayidx70.us.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 8
- %46 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
- %mul71.us.8 = fmul float %46, %conv64.us.8
- %add72.us.8 = fadd float %add72.us.7, %mul71.us.8
+ %25 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
+ %mul71.us.8 = fmul float %25, %conv64.us.8
+ %add72.us.8 = fadd float %22, %mul71.us.8
%indvars.iv.next163 = add nsw i64 %indvars.iv162, 1
%exitcond166.not = icmp eq i64 %indvars.iv.next163, 5
br i1 %exitcond166.not, label %for.cond.cleanup50.us, label %for.cond53.preheader.us, !llvm.loop !18
for.cond47.preheader.us: ; preds = %for.cond40.preheader.us, %for.cond.cleanup50.us
- %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %47, %for.cond.cleanup50.us ]
- %47 = add nuw nsw i64 %indvars.iv167, 1
+ %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %26, %for.cond.cleanup50.us ]
+ %26 = add nuw nsw i64 %indvars.iv167, 1
br label %for.cond53.preheader.us
for.cond40.for.cond.cleanup44_crit_edge.us: ; preds = %for.cond.cleanup50.us
@@ -41756,14 +40900,14 @@
br i1 %exitcond173.not, label %for.cond.cleanup38, label %for.cond40.preheader.us, !llvm.loop !19
for.cond.cleanup38: ; preds = %for.cond40.for.cond.cleanup44_crit_edge.us, %for.cond34.preheader
- call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #4
+ call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #5
ret void
}
…vectorizing operands (llvm#147583)" breaks spec accel2023 404 456 457 470 This reverts commit ac4a38e.
…ing operands (#147583): Added emission of a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, if it is more profitable.
@alexey-bataev I see that you relanded this change, but don't see any description here or in the commit as to what changed. Can you briefly describe?
Prevented the emission in the case where it tried to emit a reduction for non-immediate operands.
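A sketch of my reading of that fix (hypothetical IR, not from the patch's tests): when the chosen candidate pair comes from "skipping" into an operand's operands, a reduction over the pair would not compute the root's value.

; Root: %add = fadd fast float %a, %b, where %b = fadd fast float %b0, %b1.
; The "skip B" candidate pair is (%a, %b0); reducing over that pair in place
; of the root would compute %a + %b0 and drop %b1. The reland emits the
; reduction only when the reduced values are the root's immediate operands:
%v0 = insertelement <2 x float> poison, float %a, i32 0
%v1 = insertelement <2 x float> %v0, float %b, i32 1
%add = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %v1)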
This adds a custom lowering for v2f16 vecreduce.fadd to scalarize as opposed to padding with zeroes. This allows it to generate the more efficient faddp. Helps with #147583.
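For illustration (a hedged sketch, assuming an AArch64 target with +fullfp16; not taken from that patch's tests): the scalarized lowering lets the two-element half reduction select a single pairwise add.

; With +fullfp16, this can lower to "faddp h0, v0.2h" instead of padding
; the <2 x half> out with zeroes and doing a wider reduction.
define half @red2(<2 x half> %v) {
  %r = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> %v)
  ret half %r
}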
Added emission of a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, if it is more profitable.