From 5e13990f079ac632eb0dfeb011db6deb183821c4 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 16 May 2025 16:38:40 +0000 Subject: [PATCH 1/4] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 395 ++++++++++++++++-- .../X86/buildvector-schedule-for-subvector.ll | 5 +- .../X86/full-match-with-poison-scalar.ll | 14 +- ...dulable-instructions-become-schedulable.ll | 16 +- .../Transforms/SLPVectorizer/X86/pr47642.ll | 9 +- .../SLPVectorizer/alternate-non-profitable.ll | 6 +- 6 files changed, 368 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c63f80675fef4..97d6068571918 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -206,6 +206,12 @@ static cl::opt VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); +/// Enables vectorization of copyable elements. +static cl::opt VectorizeCopyableElements( + "slp-copyable-elements", cl::init(true), cl::Hidden, + cl::desc("Try to replace values with the idempotent instructions for " + "better vectorization.")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -835,6 +841,13 @@ static std::optional getExtractIndex(const Instruction *E) { return *EI->idx_begin(); } +namespace llvm { +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. 
+static bool doesNotNeedToBeScheduled(Value *V); +} // namespace llvm + namespace { /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. @@ -1170,9 +1183,11 @@ class InstructionsState { if (!I->isBinaryOp()) return nullptr; BinOpSameOpcodeHelper Converter(MainOp); - if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp()) - return MainOp; - return AltOp; + if (!Converter.add(I) || !Converter.add(MainOp)) + return nullptr; + if (Converter.hasAltOp() && !isAltShuffle()) + return nullptr; + return Converter.hasAltOp() ? AltOp : MainOp; } /// Checks if main/alt instructions are shift operations. @@ -1220,6 +1235,48 @@ class InstructionsState { InstructionsState(Instruction *MainOp, Instruction *AltOp) : MainOp(MainOp), AltOp(AltOp) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + + bool isCopyableElement(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr) + return false; + auto *I = dyn_cast(V); + if (!I && isa(V)) + return false; + // FIXME: remove doesNotNeedToBeScheduled() and isa() check once + // scheduling is supported. 
+ return !I || + (I->getParent() != MainOp->getParent() && + (!isVectorLikeInstWithConstOps(I) || + !isVectorLikeInstWithConstOps(MainOp))) || + (I->getOpcode() != MainOp->getOpcode() && + (isa(I) || doesNotNeedToBeScheduled(I)) && + (!I->isBinaryOp() || getMatchingMainOpOrAltOp(I) != MainOp)); + } + + bool areInstructionsWithCopyableElements(ArrayRef VL) const { + assert(valid() && "InstructionsState is invalid."); + bool HasAtLeastOneCopyableElement = false; + auto IsCopyableElement = [&](Value *V) { + bool IsCopyable = isCopyableElement(V); + HasAtLeastOneCopyableElement |= IsCopyable; + return IsCopyable; + }; + return !isAltShuffle() && all_of(VL, [&](Value *V) { + if (V == MainOp || isa(V)) + return true; + if (IsCopyableElement(V)) + return true; + auto *I = dyn_cast(V); + if (getOpcode() == Instruction::GetElementPtr && !I) + return true; + return I->getType() == MainOp->getType() && + (I->getParent() == MainOp->getParent() || + (isVectorLikeInstWithConstOps(I) && + isVectorLikeInstWithConstOps(MainOp))) && + getMatchingMainOpOrAltOp(cast(V)) == MainOp; + }) && HasAtLeastOneCopyableElement; + } }; std::pair> @@ -2878,9 +2935,6 @@ class BoUpSLP { for (OperandDataVec &Ops : OpsVec) Ops.resize(NumLanes); for (unsigned Lane : seq(NumLanes)) { - Value *V = VL[Lane]; - assert((isa(V) || isa(V)) && - "Expected instruction or poison value"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2891,13 +2945,20 @@ class BoUpSLP { // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely tell // the inverse operations by checking commutativity. 
- if (isa(V)) { + auto *I = dyn_cast(VL[Lane]); + if (!I && isa(VL[Lane])) { for (unsigned OpIdx : seq(NumOperands)) OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false}; continue; } - auto [SelectedOp, Ops] = convertTo(cast(V), S); - bool IsInverseOperation = !isCommutative(SelectedOp); + bool IsInverseOperation = false; + if (S.isCopyableElement(VL[Lane])) { + // The value is a copyable element. + IsInverseOperation = !isCommutative(MainOp); + } else { + auto [SelectedOp, Ops] = convertTo(I, S); + IsInverseOperation = !isCommutative(SelectedOp); + } for (unsigned OpIdx : seq(ArgSize)) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; @@ -3905,6 +3966,14 @@ class BoUpSLP { bool hasState() const { return S.valid(); } + /// Returns true if \p V is a copyable element. + bool isCopyableElement(Value *V) const { return S.isCopyableElement(V); } + + /// Returns true if any scalar in the list is a copyable element. + bool hasCopyableElements() const { + return S.areInstructionsWithCopyableElements(Scalars); + } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { @@ -4153,7 +4222,7 @@ class BoUpSLP { } else if (!Last->isGather()) { SmallPtrSet Processed; for (Value *V : VL) { - if (isa(V)) + if (isa(V) || S.isCopyableElement(V)) continue; auto It = ScalarToTreeEntries.find(V); if (It == ScalarToTreeEntries.end()) { @@ -4168,14 +4237,20 @@ class BoUpSLP { // Update the scheduler bundle to point to this TreeEntry. 
assert((!Bundle.getBundle().empty() || isa(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp()) || - doesNotNeedToSchedule(VL)) && + doesNotNeedToSchedule(VL) || + all_of(VL, + [&](Value *V) { + return S.isCopyableElement(V) || + doesNotNeedToBeScheduled(V); + })) && "Bundle and VL out of sync"); if (!Bundle.getBundle().empty()) { #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) auto *BundleMember = Bundle.getBundle().begin(); SmallPtrSet Processed; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V) || + !Processed.insert(V).second) continue; ++BundleMember; } @@ -4284,7 +4359,8 @@ class BoUpSLP { /// in general. ScalarsVectorizationLegality getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const; + const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const; /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. @@ -4996,7 +5072,8 @@ class BoUpSLP { /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. - ScheduleBundle &buildBundle(ArrayRef VL); + ScheduleBundle &buildBundle(ArrayRef VL, + const InstructionsState &S); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are @@ -7893,7 +7970,7 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!isa(Scalar)) + if (!isa(Scalar) || Entry->isCopyableElement(Scalar)) continue; // All uses must be replaced already? No need to do it again. 
auto It = ScalarToExtUses.find(Scalar); @@ -9617,7 +9694,8 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, PoisonValue::get(UniqueValues.front()->getType())); // Check that extended with poisons operations are still valid for // vectorization (div/rem are not allowed). - if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) { + if (!S.areInstructionsWithCopyableElements(PaddedUniqueValues) && + !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); ReuseShuffleIndices.clear(); return false; @@ -9766,13 +9844,112 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, } namespace { -/// Class accepts incoming list of values and generates the list of values -/// for scheduling and list of operands for the new nodes. +/// Class accepts incoming list of values, checks if it is able to model +/// "copyable" values as compatible operations, and generates the list of values +/// for scheduling and list of operands for the new nodes. class InstructionsCompatibilityAnalysis { DominatorTree &DT; const DataLayout &DL; const TargetTransformInfo &TTI; const TargetLibraryInfo &TLI; + unsigned MainOpcode = 0; + Instruction *MainOp = nullptr; + + /// Identifies the best candidate value, which represents main opcode + /// operation. + /// Currently the best candidate is the Add instruction with the parent + /// block with the highest DFS incoming number (block, that dominates other). + void findAndSetMainInstruction(ArrayRef VL) { + BasicBlock *Parent = nullptr; + // Checks if the instruction has supported opcode. 
+ auto IsSupportedOpcode = [](Instruction *I) { + return I && I->getOpcode() == Instruction::Add; + }; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT.isReachableFromEntry(I->getParent())) + continue; + if (!MainOp) { + MainOp = I; + Parent = I->getParent(); + continue; + } + if (Parent == I->getParent()) { + if (!IsSupportedOpcode(MainOp)) + MainOp = I; + if (MainOp->getOpcode() == I->getOpcode() && + doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) + MainOp = I; + continue; + } + auto *NodeA = DT.getNode(Parent); + auto *NodeB = DT.getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { + MainOp = I; + Parent = I->getParent(); + } + } + // FIXME: remove second part of the check, once the scheduling support + // for copyable instructions is landed. + if (!IsSupportedOpcode(MainOp) || any_of(VL, [&](Value *V) { + auto *I = dyn_cast(V); + return I && I->getOpcode() != MainOp->getOpcode() && + I->getParent() == MainOp->getParent() && !isa(I) && + !doesNotNeedToBeScheduled(I); + })) { + MainOp = nullptr; + return; + } + MainOpcode = MainOp->getOpcode(); + } + + /// Returns the idempotent value for the \p MainOp with the detected \p + /// MainOpcode. For Add, returns 0. For Or, it should choose between false and + /// the operand itself, since V or V == V. 
+ Value *selectBestIdempotentValue() const { + switch (MainOpcode) { + case Instruction::Add: + return ConstantInt::getNullValue(MainOp->getType()); + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } + + unsigned getNumberOfOperands() const { + switch (MainOpcode) { + case Instruction::Add: + return 2; + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } + + /// Returns the value and operands for the \p V, considering if it is original + /// instruction and its actual operands should be returned, or it is a + /// copyable element and it should be represented as idempotent instruction. + SmallVector getOperands(const InstructionsState &S, Value *V) const { + bool MatchesMainOp = !S.isCopyableElement(V); + switch (MainOpcode) { + case Instruction::Add: + if (isa(V)) + return {V, V}; + if (MatchesMainOp) + return SmallVector(cast(V)->operands()); + return {V, selectBestIdempotentValue()}; + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } /// Builds operands for the original instructions. void @@ -9933,22 +10110,122 @@ class InstructionsCompatibilityAnalysis { const TargetLibraryInfo &TLI) : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {} + InstructionsState + buildInstructionsState(ArrayRef VL, const BoUpSLP &R, + bool TryCopyableElementsVectorization, + bool WithProfitabilityCheck = false) { + InstructionsState S = getSameOpcode(VL, TLI); + if (S) + return S; + if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) + return S; + findAndSetMainInstruction(VL); + if (!MainOp) + return InstructionsState::invalid(); + S = InstructionsState(MainOp, MainOp); + if (!WithProfitabilityCheck) + return S; + // Check if it is profitable to vectorize the instruction. + SmallVector Operands = buildOperands(S, VL); + if (VL.size() == 2) { + // Check if the operands allow better vectorization. 
+ SmallVector, 4> Candidates; + Candidates.emplace_back(Operands[0][0], Operands[0][1]); + Candidates.emplace_back(Operands[1][0], Operands[1][1]); + if (isCommutative(MainOp)) { + Candidates.emplace_back(Operands[0][0], Operands[1][1]); + Candidates.emplace_back(Operands[1][0], Operands[0][1]); + } + // No good candidates - not profitable. + if (!R.findBestRootPair(Candidates, + BoUpSLP::LookAheadHeuristics::ScoreSplat)) { + // Deeper analysis for 2 splats/constants. + SmallVector, 4> Candidates1, Candidates2; + Candidates1.emplace_back(Operands[0][0], Operands[0][1]); + Candidates2.emplace_back(Operands[1][0], Operands[1][1]); + bool Res = R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + if (!Res && isCommutative(MainOp)) { + Candidates1.clear(); + Candidates2.clear(); + Candidates1.emplace_back(Operands[0][0], Operands[1][1]); + Candidates2.emplace_back(Operands[1][0], Operands[0][1]); + Res = R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + } + if (!Res) + return InstructionsState::invalid(); + } + } + assert(Operands.size() == 2 && "Unexpected number of operands!"); + unsigned CopyableNum = + count_if(VL, [&](Value *V) { return S.isCopyableElement(V); }); + if (CopyableNum <= VL.size() / 2) + return S; + // Check profitability if number of copyables > VL.size() / 2. + // 1. Reorder operands for better matching. + if (isCommutative(MainOp)) { + for (auto &Ops : Operands) { + // Make instructions the first operands. + if (isa(Ops.back())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + // Make constants the second operands. + if (isa(Ops.front())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + } + } + // 2. Check, if operands can be vectorized. + if (!allConstant(Operands.back())) + return InstructionsState::invalid(); + bool Res = allConstant(Operands.front()) || isSplat(Operands.front()); + if (!Res) { + // First operand not a constant or splat? 
Last attempt - check for + // potential vectorization. + InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + if (!Analysis.buildInstructionsState( + Operands.front(), R, + /*TryCopyableElementsVectorization=*/true)) + return InstructionsState::invalid(); + } + + return S; + } + SmallVector buildOperands(const InstructionsState &S, ArrayRef VL) { assert(S && "Invalid state!"); SmallVector Operands; - buildOriginalOperands(S, VL, Operands); + if (S.areInstructionsWithCopyableElements(VL)) { + MainOp = S.getMainOp(); + MainOpcode = S.getOpcode(); + Operands.assign(getNumberOfOperands(), + BoUpSLP::ValueList(VL.size(), nullptr)); + for (auto [Idx, V] : enumerate(VL)) { + SmallVector OperandsForValue = getOperands(S, V); + for (auto [OperandIdx, Operand] : enumerate(OperandsForValue)) + Operands[OperandIdx][Idx] = Operand; + } + } else { + buildOriginalOperands(S, VL, Operands); + } return Operands; } }; } // namespace -BoUpSLP::ScalarsVectorizationLegality -BoUpSLP::getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const { +BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( + ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, TryCopyableElementsVectorization, + /*WithProfitabilityCheck=*/true); // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. 
There is no @@ -10247,9 +10524,9 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, return true; }; - ScalarsVectorizationLegality Legality = - getScalarsVectorizationLegality(VL, Depth, UserTreeIdx); - const InstructionsState &S = Legality.getInstructionsState(); + ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false); + InstructionsState S = Legality.getInstructionsState(); if (!Legality.isLegal()) { if (Legality.trySplitVectorize()) { auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); @@ -10257,11 +10534,18 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp))) return; } - if (Legality.tryToFindDuplicates()) - tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx); + if (!S) + Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true); + if (!Legality.isLegal()) { + if (Legality.tryToFindDuplicates()) + tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, + UserTreeIdx); - newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); - return; + newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); + return; + } + S = Legality.getInstructionsState(); } // FIXME: investigate if there are profitable cases for VL.size() <= 4. @@ -12906,7 +13190,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, unsigned Idx) const { ArrayRef VL = E->getOperand(Idx); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, /*TryCopyableElementsVectorization=*/true); // Special processing for GEPs bundle, which may include non-gep values. 
if (!S && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, IsaPred); @@ -13040,7 +13326,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && - E->getMainOp()->getType()->isPointerTy())) && + E->getMainOp()->getType()->isPointerTy()) || + E->hasCopyableElements()) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -13052,6 +13339,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallBitVector UsedScalars(Sz, false); for (unsigned I = 0; I < Sz; ++I) { if (isa(UniqueValues[I]) && + !E->isCopyableElement(UniqueValues[I]) && getTreeEntries(UniqueValues[I]).front() == E) continue; UsedScalars.set(I); @@ -16048,6 +16336,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto *I = dyn_cast(V); if (!I) continue; + if (E->isCopyableElement(I)) + continue; if (FirstInst->getParent() == I->getParent()) { if (I->comesBefore(FirstInst)) FirstInst = I; @@ -16286,8 +16576,13 @@ Value *BoUpSLP::gather( UserOp = InsElt; } if (UserOp) { - unsigned FoundLane = Entries.front()->findLaneForValue(V); - ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane); + if (const auto *It = find_if_not( + Entries, + [&](const TreeEntry *TE) { return TE->isCopyableElement(V); }); + It != Entries.end()) { + unsigned FoundLane = Entries.front()->findLaneForValue(V); + ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane); + } } } } @@ -16925,7 +17220,9 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx, Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { ValueList &VL = E->getOperand(NodeIdx); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, 
/*TryCopyableElementsVectorization=*/true); // Special processing for GEPs bundle, which may include non-gep values. if (!S && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, IsaPred); @@ -19213,7 +19510,7 @@ Value *BoUpSLP::vectorizeTree( if (auto *EE = dyn_cast(Scalar); EE && IgnoredExtracts.contains(EE)) continue; - if (isa(Scalar)) + if (!isa(Scalar) || Entry->isCopyableElement(Scalar)) continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -19455,12 +19752,15 @@ void BoUpSLP::optimizeGatherSequence() { } BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { +BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL, + const InstructionsState &S) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique()); for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; + if (S.isCopyableElement(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); @@ -19530,7 +19830,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. 
for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -19547,7 +19847,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && @@ -19572,7 +19872,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL); + ScheduleBundle &Bundle = buildBundle(VL, S); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { for (ScheduleData *BD : Bundle.getBundle()) { @@ -19589,7 +19889,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduledBundles.find(cast(V))->getSecond().pop_back(); } @@ -20218,7 +20518,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return !isa(V) && all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !isVectorized(U); }); })) @@ -20684,7 +20984,12 @@ void BoUpSLP::computeMinimumValueSizes() { if (!IsKnownPositive) ++BitWidth1; - APInt Mask = DB->getDemandedBits(cast(Root)); + auto *I = dyn_cast(Root); + if (!I) { + MaxBitWidth = std::max(BitWidth1, MaxBitWidth); + continue; + } + APInt Mask = DB->getDemandedBits(I); unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); MaxBitWidth = std::max(std::min(BitWidth1, BitWidth2), MaxBitWidth); @@ -21013,7 +21318,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, 
BoUpSLP &R, for (Value *V : Chain) ValOps.insert(cast(V)->getValueOperand()); // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. - InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true); if (all_of(ValOps, IsaPred) && ValOps.size() > 1) { DenseSet Stores(Chain.begin(), Chain.end()); bool IsAllowedSize = diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 7ed5f33c9dc6c..c2ca20d5aed5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -4,9 +4,6 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 @@ -17,7 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> zeroinitializer, i64 4) ; CHECK-NEXT: ret void ; bb: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 992909fb3e87f..5e3d4715e99c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -7,16 +7,10 @@ define i32 @test() { ; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]] ; CHECK: [[FUNC_135_EXIT_I]]: ; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; 
CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll index 382d6ae0e0a6f..6bb52e0fc43b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll @@ -7,19 +7,17 @@ define void @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[IF_THEN_I_I:.*]]: -; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK-NEXT: br label %[[BB3:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) -; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]] -; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> , i64 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ] ; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 
40), align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index 42a50384787c8..1967c9028bef6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -7,13 +7,8 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i64 0 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] ; %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll index ad4daeab003f5..125c2dce32663 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll @@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead) define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) { ; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate( ; CHECK-SAME: i8 [[INP:%.*]], <2 x 
i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5 -; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0 -; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], <i8 0, i8 5> ; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123 ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0 From 8de5d0a3482090c3c8279a1ae222275182153d3e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 16 May 2025 16:45:44 +0000 Subject: [PATCH 2/4] Fix formatting Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 97d6068571918..e260b6f87280b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10143,8 +10143,8 @@ class InstructionsCompatibilityAnalysis { SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2; Candidates1.emplace_back(Operands[0][0], Operands[0][1]); Candidates2.emplace_back(Operands[1][0], Operands[1][1]); - bool Res = R.findBestRootPair(Candidates1) && - R.findBestRootPair(Candidates2); + bool Res = + R.findBestRootPair(Candidates1) && R.findBestRootPair(Candidates2); if (!Res && isCommutative(MainOp)) { Candidates1.clear(); Candidates2.clear(); @@ -16576,9 +16576,10 @@ Value *BoUpSLP::gather( UserOp = InsElt; } if (UserOp) { - if (const auto *It = find_if_not( - Entries, - [&](const TreeEntry *TE) { return TE->isCopyableElement(V); }); + if (const auto *It = find_if_not(Entries, + [&](const TreeEntry *TE) { + return
TE->isCopyableElement(V); + }); It != Entries.end()) { unsigned FoundLane = Entries.front()->findLaneForValue(V); ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane); From 7cf73b248bc818e3112a4e916d6e7e990a6e8250 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 18 Jun 2025 22:52:42 +0000 Subject: [PATCH 3/4] Fix formatting Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4fedfff8c0668..b4650fa449403 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -970,31 +970,33 @@ class BinOpSameOpcodeHelper { return Instruction::Xor; llvm_unreachable("Cannot find interchangeable instruction."); } + /// Return true if the instruction can be converted to \p Opcode. bool hasCandidateOpcode(unsigned Opcode) const { MaskType Candidate = Mask & SeenBefore; switch (Opcode) { - case Instruction::Shl: - return Candidate & ShlBIT; - case Instruction::AShr: - return Candidate & AShrBIT; - case Instruction::Mul: - return Candidate & MulBIT; - case Instruction::Add: - return Candidate & AddBIT; - case Instruction::Sub: - return Candidate & SubBIT; - case Instruction::And: - return Candidate & AndBIT; - case Instruction::Or: - return Candidate & OrBIT; - case Instruction::Xor: - return Candidate & XorBIT; - default: + case Instruction::Shl: + return Candidate & ShlBIT; + case Instruction::AShr: + return Candidate & AShrBIT; + case Instruction::Mul: + return Candidate & MulBIT; + case Instruction::Add: + return Candidate & AddBIT; + case Instruction::Sub: + return Candidate & SubBIT; + case Instruction::And: + return Candidate & AndBIT; + case Instruction::Or: + return Candidate & OrBIT; + case Instruction::Xor: + return Candidate & XorBIT; + default: break; } llvm_unreachable("Cannot find 
interchangeable instruction."); } + SmallVector<Value *> getOperand(const Instruction *To) const { unsigned ToOpcode = To->getOpcode(); unsigned FromOpcode = I->getOpcode(); From 8cc731d09bb27c4b07c8d0923dc125b709868cfb Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 17 Jul 2025 15:31:45 +0000 Subject: [PATCH 4/4] Use ConstantExpr::getBinOpIdentity Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 617453b0c78a1..7c6f57dc3c97b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9959,13 +9959,9 @@ class InstructionsCompatibilityAnalysis { /// MainOpcode. For Add, returns 0. For Or, it should choose between false and /// the operand itself, since V or V == V. Value *selectBestIdempotentValue() const { - switch (MainOpcode) { - case Instruction::Add: - return ConstantInt::getNullValue(MainOp->getType()); - default: - break; - } - llvm_unreachable("Unsupported opcode"); + assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), + !MainOp->isCommutative()); } /// Returns the value and operands for the \p V, considering if it is original