From ee551cf0a2b69d519da36fb9bcb5f4f1cce3c74c Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 21 Aug 2025 15:03:50 +0100 Subject: [PATCH 1/2] [VPlan] Introduce m_Cmp; match more compares Extend [Specific]Cmp_match to handle floating-point compares, and introduce m_Cmp that matches both integer and floating-point compares. Use it in simplifyRecipe to match and simplify the general case of compares. --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 + .../Transforms/Vectorize/VPlanPatternMatch.h | 67 ++++++++++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 + .../Transforms/Vectorize/VPlanTransforms.cpp | 34 ++++--- .../LoopVectorize/AArch64/masked-call.ll | 24 +++-- .../Transforms/LoopVectorize/select-cmp.ll | 91 ++++++++----------- 6 files changed, 120 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6814dc5de6716..d6bc462a0dfab 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -805,6 +805,9 @@ class VPIRFlags { GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; } + /// Returns true if the recipe has a comparison predicate. + bool hasPredicate() const { return OpType == OperationType::Cmp; } + /// Returns true if the recipe has fast-math flags. bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 490c5c4cce797..de3661953f90c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -392,16 +392,24 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Binary(Op0, Op1); } -/// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison -/// predicate. -template struct ICmp_match { +/// Cmp_match is a variant of BinaryRecipe_match that also binds the comparison +/// predicate. Opcodes must either be Instruction::ICmp or Instruction::FCmp, or +/// both. +template +struct Cmp_match { + static_assert((sizeof...(Opcodes) == 1 || sizeof...(Opcodes) == 2) && + "Expected one or two opcodes"); + static_assert( + ((Opcodes == Instruction::ICmp || Opcodes == Instruction::FCmp) && ...) && + "Expected a compare instruction opcode"); + CmpPredicate *Predicate = nullptr; Op0_t Op0; Op1_t Op1; - ICmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) + Cmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) : Predicate(&Pred), Op0(Op0), Op1(Op1) {} - ICmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {} + Cmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {} bool match(const VPValue *V) const { auto *DefR = V->getDefiningRecipe(); @@ -409,7 +417,7 @@ template struct ICmp_match { } bool match(const VPRecipeBase *V) const { - if (m_Binary(Op0, Op1).match(V)) { + if ((m_Binary(Op0, Op1).match(V) || ...)) { if (Predicate) *Predicate = cast(V)->getPredicate(); return true; @@ -418,38 +426,63 @@ template struct ICmp_match { } }; -/// SpecificICmp_match is a variant of ICmp_match that matches the comparison +/// SpecificCmp_match is a variant of Cmp_match that matches the comparison /// predicate, instead of binding it. -template struct SpecificICmp_match { +template +struct SpecificCmp_match { const CmpPredicate Predicate; Op0_t Op0; Op1_t Op1; - SpecificICmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS) + SpecificCmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS) : Predicate(Pred), Op0(LHS), Op1(RHS) {} bool match(const VPValue *V) const { CmpPredicate CurrentPred; - return ICmp_match(CurrentPred, Op0, Op1).match(V) && + return Cmp_match(CurrentPred, Op0, Op1) + .match(V) && CmpPredicate::getMatching(CurrentPred, Predicate); } }; template -inline ICmp_match m_ICmp(const Op0_t &Op0, const Op1_t &Op1) { - return ICmp_match(Op0, Op1); +inline Cmp_match m_ICmp(const Op0_t &Op0, + const Op1_t &Op1) { + return Cmp_match(Op0, Op1); } template -inline ICmp_match m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, - const Op1_t &Op1) { - return ICmp_match(Pred, Op0, Op1); +inline Cmp_match +m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) { + return Cmp_match(Pred, Op0, Op1); } template -inline SpecificICmp_match +inline SpecificCmp_match m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { - return SpecificICmp_match(MatchPred, Op0, Op1); + return SpecificCmp_match(MatchPred, Op0, + Op1); +} + +template +inline Cmp_match +m_Cmp(const Op0_t &Op0, const Op1_t &Op1) { + return Cmp_match(Op0, + Op1); +} + +template +inline Cmp_match +m_Cmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) { + return Cmp_match( + Pred, Op0, Op1); +} + +template +inline SpecificCmp_match +m_SpecificCmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { + return SpecificCmp_match( + MatchPred, Op0, Op1); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index a92540f457c73..c4fdcccc6d62b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2930,6 +2930,9 @@ static void scalarizeInstruction(const Instruction *Instr, RepRecipe->applyFlags(*Cloned); RepRecipe->applyMetadata(*Cloned); + if (RepRecipe->hasPredicate()) + cast(Cloned)->setPredicate(RepRecipe->getPredicate()); + if (auto DL = RepRecipe->getDebugLoc()) State.setDebugLocFrom(DL); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 677d97e54d556..352cb6caf67db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1106,33 +1106,31 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); // Try to fold Not into compares by adjusting the predicate in-place. - if (auto *WideCmp = dyn_cast(A)) { - if ((WideCmp->getOpcode() == Instruction::ICmp || - WideCmp->getOpcode() == Instruction::FCmp) && - all_of(WideCmp->users(), [&WideCmp](VPUser *U) { - return match(U, m_CombineOr(m_Not(m_Specific(WideCmp)), - m_Select(m_Specific(WideCmp), - m_VPValue(), m_VPValue()))); + CmpPredicate Pred; + if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) { + auto *Cmp = cast(A); + if (all_of(Cmp->users(), [&Cmp](VPUser *U) { + return match(U, m_CombineOr(m_Not(m_Specific(Cmp)), + m_Select(m_Specific(Cmp), m_VPValue(), + m_VPValue()))); })) { - WideCmp->setPredicate( - CmpInst::getInversePredicate(WideCmp->getPredicate())); - for (VPUser *U : to_vector(WideCmp->users())) { + Cmp->setPredicate(CmpInst::getInversePredicate(Pred)); + for (VPUser *U : to_vector(Cmp->users())) { auto *R = cast(U); - if (match(R, m_Select(m_Specific(WideCmp), m_VPValue(X), - m_VPValue(Y)))) { + if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) { // select (cmp pred), x, y -> select (cmp inv_pred), y, x R->setOperand(1, Y); R->setOperand(2, X); } else { // not (cmp pred) -> cmp inv_pred - assert(match(R, m_Not(m_Specific(WideCmp))) && "Unexpected user"); - R->replaceAllUsesWith(WideCmp); + assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user"); + R->replaceAllUsesWith(Cmp); } } - // If WideCmp doesn't have a debug location, use the one from the - // negation, to preserve the location. - if (!WideCmp->getDebugLoc() && R.getDebugLoc()) - WideCmp->setDebugLoc(R.getDebugLoc()); + // If Cmp doesn't have a debug location, use the one from the negation, + // to preserve the location. + if (!Cmp->getDebugLoc() && R.getDebugLoc()) + Cmp->setDebugLoc(R.getDebugLoc()); } } } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 11bb4d234f3f3..e5697e121e9e9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -973,18 +973,16 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]] ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: -; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP19:.*]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP19]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP19]] ] +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP18:.*]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP18]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP18]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]] -; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = xor i1 [[TMP7]], true -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP8]], true -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP9]], i1 false -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP10]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ule double [[TMP5]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ule double [[TMP6]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false ; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP12]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]] @@ -993,11 +991,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor i1 [[TMP14]], true ; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] -; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[TMP19]] -; TFA_INTERLEAVE: [[BB18]]: +; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB16:.*]], label %[[TMP18]] +; TFA_INTERLEAVE: [[BB16]]: ; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8 -; TFA_INTERLEAVE-NEXT: br label %[[TMP19]] -; TFA_INTERLEAVE: [[TMP19]]: +; TFA_INTERLEAVE-NEXT: br label %[[TMP18]] +; TFA_INTERLEAVE: [[TMP18]]: ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index 5e48b1f72b111..e4922d3e4f627 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -134,18 +134,14 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -510,18 +506,14 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -700,18 +692,14 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast one float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast one float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast one float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast one float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -890,18 +878,14 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp one float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp one float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp one float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp one float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1049,8 +1033,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp eq i32 [[A]], 3 -; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp ne i32 [[A]], 3 ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF1IC4: [[VECTOR_BODY]]: ; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1058,10 +1041,10 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]] +; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP0]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] From 48acbb699b78fd4db86bf32e4293901824aec760 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 22 Aug 2025 13:57:14 +0100 Subject: [PATCH 2/2] [LV] Fix for planContainsAdditionalSimplifications --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c4110582da1ef..d4b06c3933b90 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7002,12 +7002,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, if (Instruction *UI = GetInstructionForCost(&R)) { // If we adjusted the predicate of the recipe, the cost in the legacy // cost model may be different. - if (auto *WidenCmp = dyn_cast(&R)) { - if ((WidenCmp->getOpcode() == Instruction::ICmp || - WidenCmp->getOpcode() == Instruction::FCmp) && - WidenCmp->getPredicate() != cast(UI)->getPredicate()) - return true; - } + using namespace VPlanPatternMatch; + CmpPredicate Pred; + if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) && + cast(R).getPredicate() != + cast(UI)->getPredicate()) + return true; SeenInstrs.insert(UI); } }