diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5073622a09553..9884dcb97ec5c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1520,7 +1520,7 @@ class LoopVectorizationCostModel { /// \p Multiplier is an aditional scaling factor applied to VF before /// comparing to EpilogueVectorizationMinVF. bool isEpilogueVectorizationProfitable(const ElementCount VF, - const unsigned Multiplier) const; + const unsigned IC) const; /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -4292,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { return TTI.getVScaleForTuning(); } +/// This function attempts to return a value that represents the vectorization +/// factor at runtime. For fixed-width VFs we know this precisely at compile +/// time, but for scalable VFs we calculate it based on an estimate of the +/// vscale value. +static unsigned getEstimatedRuntimeVF(const Loop *L, + const TargetTransformInfo &TTI, + ElementCount VF) { + unsigned EstimatedVF = VF.getKnownMinValue(); + if (VF.isScalable()) + if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) + EstimatedVF *= *VScale; + assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); + return EstimatedVF; +} + bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B, const unsigned MaxTripCount) const { @@ -4594,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost C = CM.expectedCost(VF); VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); - unsigned AssumedMinimumVscale = - getVScaleForTuning(OrigLoop, TTI).value_or(1); - unsigned Width = - Candidate.Width.isScalable() - ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); + unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF << " costs: " << (Candidate.Cost / Width)); if (VF.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " - << AssumedMinimumVscale << ")"); + << getVScaleForTuning(OrigLoop, TTI).value_or(1) + << ")"); LLVM_DEBUG(dbgs() << ".\n"); if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { @@ -4670,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( } bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF, const unsigned Multiplier) const { + const ElementCount VF, const unsigned IC) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops @@ -4685,9 +4696,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) - return true; - return false; + // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable + // VFs when deciding profitability. + // See related "TODO: extend to support scalable VFs." in + // selectEpilogueVectorizationFactor. + unsigned Multiplier = VF.isFixed() ? 
IC : 1; + return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= + EpilogueVectorizationMinVF; } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( @@ -4730,11 +4745,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - unsigned Multiplier = IC; - if (MainLoopVF.isScalable()) - Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1); - - if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) { + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -4743,12 +4754,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. - ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) - EstimatedRuntimeVF *= *VScale; - } + ElementCount EstimatedRuntimeVF = + ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); @@ -4988,13 +4995,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - unsigned EstimatedVF = VF.getKnownMinValue(); - if (VF.isScalable()) { - if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) - EstimatedVF *= *VScale; - } - assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); - + unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (KnownTC > 0) { // At least one iteration must 
be scalar when this constraint holds. So the @@ -7426,10 +7427,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, // Now compute and add the VPlan-based cost. Cost += Plan.cost(VF, CostCtx); #ifndef NDEBUG - unsigned EstimatedWidth = VF.getKnownMinValue(); - if (VF.isScalable()) - if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) - EstimatedWidth *= *VScale; + unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { @@ -9811,8 +9809,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { } static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, - VectorizationFactor &VF, - std::optional<unsigned> VScale, Loop *L, + VectorizationFactor &VF, Loop *L, + const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); @@ -9864,13 +9862,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = VF.Width.getKnownMinValue(); - if (VF.Width.isScalable()) { - unsigned AssumedMinimumVscale = 1; - if (VScale) - AssumedMinimumVscale = *VScale; - IntVF *= AssumedMinimumVscale; - } + unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); uint64_t RtC = *CheckCost.getValue(); uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 
0 : divideCeil(RtC * IntVF, Div); @@ -10119,8 +10111,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - PSE, SEL)) { + !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),