diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 93f6d1d82e244..de8dc8600e58b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -226,7 +226,7 @@ class VPBuilder {
 
 /// TODO: The following VectorizationFactor was pulled out of
 /// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// VectorizerParams::VectorizationFactor.
 /// We need to streamline them.
 
 /// Information about vectorization costs.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1423deb5a73f9..417c577e3c5dc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
   bool selectUserVectorizationFactor(ElementCount UserVF) {
     collectUniformsAndScalars(UserVF);
     collectInstsToScalarize(UserVF);
-    return expectedCost(UserVF).first.isValid();
+    return expectedCost(UserVF).isValid();
   }
 
   /// \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
     Scalars.clear();
   }
 
-  /// The vectorization cost is a combination of the cost itself and a boolean
-  /// indicating whether any of the contributing operations will actually
-  /// operate on vector values after type legalization in the backend. If this
-  /// latter value is false, then all operations will be scalarized (i.e. no
-  /// vectorization has actually taken place).
-  using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width. If \p Invalid is not nullptr, this function
   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
   /// each instruction that has an Invalid cost for the given VF.
-  VectorizationCostTy
+  InstructionCost
   expectedCost(ElementCount VF,
                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
 
@@ -1642,12 +1635,7 @@ class LoopVectorizationCostModel {
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
 
   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4795,9 +4783,101 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+                                const TargetTransformInfo &TTI) {
+  assert(VF.isVector() && "Checking a scalar VF?");
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  // Set of already visited types.
+  DenseSet<Type *> Visited;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      // Continue early if the recipe is considered to not produce a vector
+      // result. Note that this includes VPInstruction where some opcodes may
+      // produce a vector, to preserve existing behavior as VPInstructions model
+      // aspects not directly mapped to existing IR instructions.
+      switch (R.getVPDefID()) {
+      case VPDef::VPDerivedIVSC:
+      case VPDef::VPScalarIVStepsSC:
+      case VPDef::VPScalarCastSC:
+      case VPDef::VPReplicateSC:
+      case VPDef::VPInstructionSC:
+      case VPDef::VPCanonicalIVPHISC:
+      case VPDef::VPVectorPointerSC:
+      case VPDef::VPExpandSCEVSC:
+      case VPDef::VPEVLBasedIVPHISC:
+      case VPDef::VPPredInstPHISC:
+      case VPDef::VPBranchOnMaskSC:
+        continue;
+      case VPDef::VPReductionSC:
+      case VPDef::VPActiveLaneMaskPHISC:
+      case VPDef::VPWidenCallSC:
+      case VPDef::VPWidenCanonicalIVSC:
+      case VPDef::VPWidenCastSC:
+      case VPDef::VPWidenGEPSC:
+      case VPDef::VPWidenSC:
+      case VPDef::VPWidenSelectSC:
+      case VPDef::VPBlendSC:
+      case VPDef::VPFirstOrderRecurrencePHISC:
+      case VPDef::VPWidenPHISC:
+      case VPDef::VPWidenIntOrFpInductionSC:
+      case VPDef::VPWidenPointerInductionSC:
+      case VPDef::VPReductionPHISC:
+      case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenLoadEVLSC:
+      case VPDef::VPWidenLoadSC:
+      case VPDef::VPWidenStoreEVLSC:
+      case VPDef::VPWidenStoreSC:
+        break;
+      default:
+        llvm_unreachable("unhandled recipe");
+      }
+
+      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumLegalParts)
+          return false;
+        if (VF.isScalable()) {
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar
+          // ones. If we ever find a target which wants to lower scalable
+          // vectors back to scalars, we'll need to update this code to
+          // explicitly ask TTI about the register class uses for each part.
+          return NumLegalParts <= VF.getKnownMinValue();
+        }
+        // Two or more parts that share a register - are vectorized.
+        return NumLegalParts < VF.getKnownMinValue();
+      };
+
+      // If no def nor is a store, e.g., branches, continue - no value to check.
+      if (R.getNumDefinedValues() == 0 &&
+          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+              &R))
+        continue;
+      // For multi-def recipes, currently only interleaved loads, suffice to
+      // check first def only.
+      // For stores check their stored value; for interleaved stores suffice
+      // the check first stored value only. In all cases this is the second
+      // operand.
+      VPValue *ToCheck =
+          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+      if (!Visited.insert({ScalarTy}).second)
+        continue;
+      if (WillWiden(ScalarTy))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
-  InstructionCost ExpectedCost =
-      CM.expectedCost(ElementCount::getFixed(1)).first;
+  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
   assert(any_of(VPlans,
@@ -4826,9 +4906,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       if (VF.isScalar())
         continue;
 
-      LoopVectorizationCostModel::VectorizationCostTy C =
-          CM.expectedCost(VF, &InvalidCosts);
-      VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost);
+      InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
 
 #ifndef NDEBUG
       unsigned AssumedMinimumVscale =
@@ -4845,7 +4924,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
         LLVM_DEBUG(dbgs() << ".\n");
 #endif
 
-      if (!C.second && !ForceVectorization) {
+      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
         LLVM_DEBUG(
             dbgs()
             << "LV: Not considering vector loop of width " << VF
@@ -5146,7 +5225,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
-    LoopCost = expectedCost(VF).first;
+    LoopCost = expectedCost(VF);
     assert(LoopCost.isValid() &&
            "Expected to have chosen a VF with valid cost");
     // Loop body is free and there is no need for interleaving.
@@ -5717,15 +5796,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);
 
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
     // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5769,14 +5847,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
   return Discount;
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
-  VectorizationCostTy Cost;
+  InstructionCost Cost;
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    VectorizationCostTy BlockCost;
+    InstructionCost BlockCost;
 
     // For each instruction in the old loop.
     for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5785,22 +5862,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;
 
-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);
 
       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);
 
       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);
 
-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }
 
     // If we are vectorizing a predicated block, it will have been
@@ -5811,10 +5885,9 @@ LoopVectorizationCostModel::expectedCost(
     // cost by the probability of executing it. blockNeedsPredication from
     // Legal is used so as to not include all blocks in tail folded loops.
     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost /= getReciprocalPredBlockProb();
 
-    Cost.first += BlockCost.first;
-    Cost.second |= BlockCost.second;
+    Cost += BlockCost;
   }
 
   return Cost;
@@ -6213,49 +6286,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost
 LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6646,8 +6676,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }
 
 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6670,6 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void) hasSingleCopyAfterVectorization;
 
+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6685,6 +6733,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index eca5d1d4c5e1d..c4b096d653158 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -54,6 +54,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return ResTy;
   }
   case Instruction::ICmp:
+  case VPInstruction::ActiveLaneMask:
+    return inferScalarType(R->getOperand(1));
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::Not:
     return SetResultTyFromOp();
@@ -68,6 +70,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::PtrAdd:
     // Return the type based on the pointer argument (i.e. first operand).
     return inferScalarType(R->getOperand(0));
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchOnCount:
+    return Type::getVoidTy(Ctx);
   default:
     break;
   }
@@ -237,19 +242,21 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
 
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
-          .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
-                VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
-                VPEVLBasedIVPHIRecipe>([this](const auto *R) {
-            // Handle header phi recipes, except VPWidenIntOrFpInduction
-            // which needs special handling due it being possibly truncated.
-            // TODO: consider inferring/caching type of siblings, e.g.,
-            // backedge value, here and in cases below.
-            return inferScalarType(R->getStartValue());
-          })
+          .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
+                VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
+                VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
+              [this](const auto *R) {
+                // Handle header phi recipes, except VPWidenIntOrFpInduction
+                // which needs special handling due it being possibly truncated.
+                // TODO: consider inferring/caching type of siblings, e.g.,
+                // backedge value, here and in cases below.
+                return inferScalarType(R->getStartValue());
+              })
           .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
               [](const auto *R) { return R->getScalarType(); })
-          .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
-                VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+          .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
+                VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
+                VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
             return inferScalarType(R->getOperand(0));
           })
           .Case
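
The core behavioral change in this patch is that the per-instruction `TypeNotScalarized` bit (the `bool` half of the removed `VectorizationCostTy`) is replaced by a single VPlan-level query: `willGenerateVectors` walks the recipes once per candidate plan and asks, for each distinct scalar type, whether the legalized vector type still occupies vector registers. The standalone sketch below restates that decision rule outside LLVM; `MockTTI`, `willWiden`, and the sample part counts are hypothetical stand-ins for `TargetTransformInfo::getNumberOfParts`, not part of the patch.

    // Sketch of the WillWiden decision rule, assuming a target query that
    // reports how many registers the legalized vector type splits into.
    #include <iostream>

    struct MockTTI {
      // Hypothetical stand-in for TTI.getNumberOfParts(VectorTy);
      // 0 models a type that cannot be legalized at this VF.
      unsigned NumLegalParts;
    };

    // A type is "really vectorized" at a given VF when legalization does not
    // decay the vector into one scalar register per lane.
    static bool willWiden(const MockTTI &TTI, unsigned KnownMinVF,
                          bool ScalableVF) {
      if (!TTI.NumLegalParts)
        return false; // Invalid type at this VF.
      if (ScalableVF)
        // <vscale x 1 x iN> still counts: scalable vector registers are a
        // distinct register class from scalar registers.
        return TTI.NumLegalParts <= KnownMinVF;
      // Fixed VF: NumLegalParts == VF means one scalar register per lane,
      // i.e. the backend scalarized the type.
      return TTI.NumLegalParts < KnownMinVF;
    }

    int main() {
      std::cout << willWiden({1}, 4, false)  // e.g. <4 x i32> in one register
                << willWiden({2}, 8, false)  // <8 x i32> split into two parts
                << willWiden({4}, 4, false)  // one part per lane: scalarized
                << willWiden({0}, 4, false)  // not legalizable
                << '\n';                     // prints "1100"
    }

Hoisting this rule into a plan walk is what lets `expectedCost` and `getInstructionCost` return a plain `InstructionCost`: the only remnant of the old pair is the early `InstructionCost::getInvalid()` return for types with no legal parts.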