@@ -123,6 +123,7 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"

@@ -247,10 +248,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                    "Create lane mask using active.lane.mask intrinsic, and use "
                    "it for both data and control flow"),
-        clEnumValN(
-            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
-            "data-and-control-without-rt-check",
-            "Similar to data-and-control, but remove the runtime check")));
+        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+                   "data-and-control-without-rt-check",
+                   "Similar to data-and-control, but remove the runtime check"),
+        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+                   "Use predicated EVL instructions for tail folding if the "
+                   "target supports vector length predication")));

 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,

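With data-with-evl tail folding, the vectorized loop bounds every iteration by an explicit vector length (EVL) rather than by a lane mask alone, so no scalar epilogue is needed and the final partial iteration simply runs with a shorter EVL. A minimal scalar model of the idea, assuming a loop over n elements and a vector factor VF (the function and variable names below are illustrative, not taken from the patch):

#include <algorithm>
#include <cstddef>

// Scalar model of EVL-style tail folding: each trip processes `evl` lanes,
// where evl = min(VF, elements remaining), so the last trip shrinks instead
// of falling back to a scalar remainder loop.
void saxpy_evl_model(float a, const float *x, float *y, size_t n, size_t VF) {
  for (size_t i = 0; i < n;) {
    size_t evl = std::min(VF, n - i); // explicit vector length for this trip
    for (size_t l = 0; l < evl; ++l)  // stands in for one vp.load/vp.store pair
      y[i + l] += a * x[i + l];
    i += evl;
  }
}
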
@@ -1098,9 +1101,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
       // handled.
       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
           isa<VPInterleaveRecipe>(CurRec) ||
-          isa<VPScalarIVStepsRecipe>(CurRec) ||
-          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
-          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
+          isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
         continue;

       // This recipe contributes to the address computation of a widen

@@ -1633,6 +1634,23 @@ class LoopVectorizationCostModel {
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }

+  /// Returns true if VP intrinsics with explicit vector length support should
+  /// be generated in the tail folded loop.
+  bool useVPIWithVPEVLVectorization() const {
+    return PreferEVL && !EnableVPlanNativePath &&
+           getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+           // FIXME: implement support for max safe dependency distance.
+           Legal->isSafeForAnyVectorWidth() &&
+           // FIXME: remove this once reductions are supported.
+           Legal->getReductionVars().empty() &&
+           // FIXME: remove this once vp_reverse is supported.
+           none_of(
+               WideningDecisions,
+               [](const std::pair<std::pair<Instruction *, ElementCount>,
+                                  std::pair<InstWidening, InstructionCost>>
+                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+  }
+
   /// Returns true if the Phi is part of an inloop reduction.
   bool isInLoopReduction(PHINode *Phi) const {
     return InLoopReductions.contains(Phi);

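The none_of condition above spells out the full WideningDecisions value type, which obscures what it actually checks. A simplified restatement of the same test (a sketch written against the same cost-model members, not a proposed change):

// Sketch: EVL vectorization is declined while any memory access in the loop
// was assigned the CM_Widen_Reverse decision, since vp_reverse lowering is
// not supported yet (see the FIXME above).
bool HasReverseAccess = any_of(WideningDecisions, [](const auto &Decision) {
  return Decision.second.first == CM_Widen_Reverse;
});
// useVPIWithVPEVLVectorization() returns true only when !HasReverseAccess.
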
@@ -1778,6 +1796,10 @@ class LoopVectorizationCostModel {
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool CanFoldTailByMasking = false;

+  /// Control whether to generate VP intrinsics with explicit-vector-length
+  /// support in vectorized code.
+  bool PreferEVL = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated

@@ -4733,6 +4755,41 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
     CanFoldTailByMasking = true;
+    if (getTailFoldingStyle() == TailFoldingStyle::None)
+      return MaxFactors;
+
+    if (UserIC > 1) {
+      LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                           "not generate VP intrinsics since interleave count "
+                           "specified is greater than 1.\n");
+      return MaxFactors;
+    }
+
+    if (MaxFactors.ScalableVF.isVector()) {
+      assert(MaxFactors.ScalableVF.isScalable() &&
+             "Expected scalable vector factor.");
+      // FIXME: use actual opcode/data type for analysis here.
+      PreferEVL = getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+                  TTI.hasActiveVectorLength(0, nullptr, Align());
+#if !NDEBUG
+      if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+        if (PreferEVL)
+          dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                    "try to generate VP Intrinsics.\n";
+        else
+          dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                    "not try to generate VP Intrinsics since the target "
+                    "does not support vector length predication.\n";
+      }
+#endif // !NDEBUG
+
+      // Tail folded loop using VP intrinsics restricts the VF to be scalable
+      // for now.
+      // TODO: extend it for fixed vectors, if required.
+      if (PreferEVL)
+        MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
+
     return MaxFactors;
   }

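TTI.hasActiveVectorLength(Opcode, DataType, Alignment) is the target hook that decides whether PreferEVL may be set; the patch passes a placeholder opcode and a null type, which is exactly what the FIXME flags. If that FIXME were addressed, the query would presumably consult a real access, along the lines of this sketch (WidestTy and AccessAlign are hypothetical stand-ins, not values this patch computes):

// Hypothetical refinement of the placeholder query: ask the target about an
// actual widened load of the loop's widest loaded/stored element type.
bool TargetSupportsEVL =
    TTI.hasActiveVectorLength(Instruction::Load, WidestTy, AccessAlign);
PreferEVL = getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
            TargetSupportsEVL;
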
@@ -5342,6 +5399,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!isScalarEpilogueAllowed())
     return 1;

+  // Do not interleave if EVL is preferred and no User IC is specified.
+  if (useVPIWithVPEVLVectorization())
+    return 1;
+
   // We used the distance for the interleave count.
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;

@@ -8596,6 +8657,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       VPlanTransforms::truncateToMinimalBitwidths(
           *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
       VPlanTransforms::optimize(*Plan, *PSE.getSE());
+      if (CM.useVPIWithVPEVLVectorization())
+        VPlanTransforms::addExplicitVectorLength(*Plan);
       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }

@@ -9451,6 +9514,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }

+/// Creates either vp_store or vp_scatter intrinsics calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+                                Value *StoredVal, bool IsScatter, Value *Mask,
+                                Value *EVLPart, const Align &Alignment) {
+  CallInst *Call;
+  if (IsScatter) {
+    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVLPart->getContext()),
+                                   Intrinsic::vp_scatter,
+                                   {StoredVal, Addr, Mask, EVLPart});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVLPart).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVLPart->getContext()),
+        {StoredVal, Addr}));
+  }
+  Call->addParamAttr(
+      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsics calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+                                                   VectorType *DataTy,
+                                                   Value *Addr, bool IsGather,
+                                                   Value *Mask, Value *EVLPart,
+                                                   const Align &Alignment) {
+  CallInst *Call;
+  if (IsGather) {
+    Call = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
+                                   {Addr, Mask, EVLPart}, nullptr,
+                                   "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVLPart).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  Call->addParamAttr(
+      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

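In both helpers the alignment is attached as a parameter attribute on the pointer operand of the resulting VP call: operand 1 for the store/scatter forms (value, pointer, mask, evl) and operand 0 for the load/gather forms (pointer, mask, evl), which is why the two addParamAttr indices differ. An illustrative call site for the consecutive-store case (all values hypothetical; the real call sites are in the recipe changes below):

// Illustrative only: emit a consecutive (non-scatter) predicated store of
// WideVal to Ptr, covering EVL lanes under Mask, with 4-byte alignment.
Instruction *NewStore = lowerStoreUsingVectorIntrinsics(
    Builder, /*Addr=*/Ptr, /*StoredVal=*/WideVal, /*IsScatter=*/false,
    /*Mask=*/Mask, /*EVLPart=*/EVL, /*Alignment=*/Align(4));
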
@@ -9482,14 +9591,31 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     }
   }

+  auto MaskValue = [&](unsigned Part) -> Value * {
+    if (isMaskRequired)
+      return BlockInMaskParts[Part];
+    return nullptr;
+  };
+
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());

     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Instruction *NewSI = nullptr;
       Value *StoredVal = State.get(StoredValue, Part);
-      if (CreateGatherScatter) {
+      if (State.EVL) {
+        Value *EVLPart = State.get(State.EVL, Part);
+        // If EVL is not nullptr, then EVL must be a valid value set during plan
+        // creation, possibly default value = whole vector register length. EVL
+        // is created only if TTI prefers predicated vectorization, thus if EVL
+        // is not nullptr it also implies preference for predicated
+        // vectorization.
+        // FIXME: Support reverse store after vp_reverse is added.
+        NewSI = lowerStoreUsingVectorIntrinsics(
+            Builder, State.get(getAddr(), Part), StoredVal, CreateGatherScatter,
+            MaskValue(Part), EVLPart, Alignment);
+      } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,

@@ -9519,7 +9645,18 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(LI->getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
-    if (CreateGatherScatter) {
+    if (State.EVL) {
+      Value *EVLPart = State.get(State.EVL, Part);
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly default value = whole vector register length. EVL
+      // is created only if TTI prefers predicated vectorization, thus if EVL
+      // is not nullptr it also implies preference for predicated
+      // vectorization.
+      // FIXME: Support reverse loading after vp_reverse is added.
+      NewLI = lowerLoadUsingVectorIntrinsics(
+          Builder, DataTy, State.get(getAddr(), Part), CreateGatherScatter,
+          MaskValue(Part), EVLPart, Alignment);
+    } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,

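The comment's phrase "possibly default value = whole vector register length" describes the degenerate case where the plan supplies an EVL covering every lane of the part, so the llvm.vp.* call behaves like the corresponding mask-only wide access. For a scalable factor such as <vscale x 4 x float>, that full length is only known at run time (an illustrative calculation, not code from the patch):

// Lanes covered by one part of a <vscale x 4 x float> vector; an EVL equal
// to this value predicates nothing away beyond the mask.
uint64_t fullVectorLength(uint64_t VScale) {
  const uint64_t KnownMinLanes = 4; // the "4" in <vscale x 4 x float>
  return KnownMinLanes * VScale;
}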