Skip to content

[VectorCombine] Refine cost model and decision logic in foldSelectShuffle #146694

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 112 additions & 4 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3055,6 +3055,47 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
return true;
}

/// Returns true if this ShuffleVectorInst eventually feeds into a
/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
/// chains of shuffles and binary operators (in any combination/order).
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
SmallPtrSet<Instruction *, 8> Visited;
SmallVector<Instruction *, 4> WorkList;
bool FoundReduction = false;

WorkList.push_back(SVI);
while (!WorkList.empty()) {
Instruction *I = WorkList.pop_back_val();
for (User *U : I->users()) {
auto *UI = dyn_cast<Instruction>(U);
if (!UI || !Visited.insert(UI).second)
continue;
if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
// More than one reduction reached
if (FoundReduction)
return false;
switch (II->getIntrinsicID()) {
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_mul:
case Intrinsic::vector_reduce_and:
case Intrinsic::vector_reduce_or:
case Intrinsic::vector_reduce_xor:
FoundReduction = true;
continue;
default:
return false;
}
}

if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
return false;

WorkList.emplace_back(UI);
}
}
return FoundReduction;
}

/// This method looks for groups of shuffles acting on binops, of the form:
/// %x = shuffle ...
/// %y = shuffle ...
Expand Down Expand Up @@ -3297,15 +3338,80 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
};

unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
unsigned MaxVectorSize =
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
// When there are multiple shufflevector operations on the same input,
// especially when the vector length is larger than the register size,
// identical shuffle patterns may occur across different groups of elements.
// To avoid overestimating the cost by counting these repeated shuffles more
// than once, we only account for unique shuffle patterns. This adjustment
// prevents inflated costs in the cost model for wide vectors split into
// several register-sized groups.
std::set<SmallVector<int, 4>> UniqueShuffles;
auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
// Compute the cost for performing the shuffle over the full vector.
auto ShuffleCost =
TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
if (NumFullVectors < 2)
return C + ShuffleCost;
SmallVector<int, 4> SubShuffle(MaxElementsInVector);
unsigned NumUniqueGroups = 0;
unsigned NumGroups = Mask.size() / MaxElementsInVector;
// For each group of MaxElementsInVector contiguous elements,
// collect their shuffle pattern and insert into the set of unique patterns.
for (unsigned k = 0; k < NumFullVectors; ++k) {
for (unsigned l = 0; l < MaxElementsInVector; ++l)
SubShuffle[l] = Mask[MaxElementsInVector * k + l];
if (UniqueShuffles.insert(SubShuffle).second)
NumUniqueGroups += 1;
}
return C + ShuffleCost * NumUniqueGroups / NumGroups;
};
auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
auto *SV = dyn_cast<ShuffleVectorInst>(I);
if (!SV)
return C;
SmallVector<int, 16> Mask;
SV->getShuffleMask(Mask);
return AddShuffleMaskAdjustedCost(C, Mask);
};
// Check that input consists of ShuffleVectors applied to the same input
auto AllShufflesHaveSameOperands =
[](SmallPtrSetImpl<Instruction *> &InputShuffles) {
if (InputShuffles.size() < 2)
return false;
ShuffleVectorInst *FirstSV =
dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
if (!FirstSV)
return false;

Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
return std::all_of(
std::next(InputShuffles.begin()), InputShuffles.end(),
[&](Instruction *I) {
ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
});
};

// Get the costs of the shuffles + binops before and after with the new
// shuffle masks.
InstructionCost CostBefore =
TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
InstructionCost(0), AddShuffleCost);
if (AllShufflesHaveSameOperands(InputShuffles)) {
UniqueShuffles.clear();
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
InstructionCost(0), AddShuffleAdjustedCost);
} else {
CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
InstructionCost(0), AddShuffleCost);
}

// The new binops will be unused for lanes past the used shuffle lengths.
// These types attempt to get the correct cost for that from the target.
Expand All @@ -3316,8 +3422,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
InstructionCost CostAfter =
TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
UniqueShuffles.clear();
CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
InstructionCost(0), AddShuffleMaskCost);
InstructionCost(0), AddShuffleMaskAdjustedCost);
std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
CostAfter +=
std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
Expand All @@ -3326,7 +3433,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
<< " vs CostAfter: " << CostAfter << "\n");
if (CostBefore <= CostAfter)
if (CostBefore < CostAfter || CostBefore == 0 ||
(CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
return false;

// The cost model has passed, create the new instructions.
Expand Down
50 changes: 27 additions & 23 deletions llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,29 +80,33 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 17, i32 1, i32 16, i32 0, i32 19, i32 3, i32 18, i32 2, i32 21, i32 5, i32 20, i32 4, i32 23, i32 7, i32 22, i32 6>
; CHECK-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP51]]
; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP51]]
; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[TMP55]]
; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP55]]
; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
; CHECK-NEXT: [[TMP67:%.*]] = mul nuw <16 x i32> [[TMP66]], splat (i32 65535)
; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP64]]
; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP67]]
; CHECK-NEXT: [[TMP70:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP70]], 65535
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP70]], 16
; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], splat (i32 15)
; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], splat (i32 65537)
; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], splat (i32 65535)
; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
; CHECK-NEXT: [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16
; CHECK-NEXT: [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[RDD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
Expand Down
Loading