diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b2fced47b9527..9bde5d96edf90 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3055,6 +3055,47 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
   return true;
 }
 
+/// Returns true if this ShuffleVectorInst eventually feeds into a
+/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
+/// chains of shuffles and binary operators (in any combination/order).
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Instruction *> WorkList;
+  bool FoundReduction = false;
+
+  WorkList.push_back(SVI);
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.pop_back_val();
+    for (User *U : I->users()) {
+      auto *UI = dyn_cast<Instruction>(U);
+      if (!UI || !Visited.insert(UI).second)
+        continue;
+      if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
+        // More than one reduction reached.
+        if (FoundReduction)
+          return false;
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::vector_reduce_add:
+        case Intrinsic::vector_reduce_mul:
+        case Intrinsic::vector_reduce_and:
+        case Intrinsic::vector_reduce_or:
+        case Intrinsic::vector_reduce_xor:
+          FoundReduction = true;
+          continue;
+        default:
+          return false;
+        }
+      }
+
+      if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
+        return false;
+
+      WorkList.emplace_back(UI);
+    }
+  }
+  return FoundReduction;
+}
+
 /// This method looks for groups of shuffles acting on binops, of the form:
 ///  %x = shuffle ...
 ///  %y = shuffle ...
@@ -3297,6 +3338,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
            TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
   };
+  unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
+  unsigned MaxVectorSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+  unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+  // When there are multiple shufflevector operations on the same input,
+  // especially when the vector length is larger than the register size,
+  // identical shuffle patterns may occur across different groups of elements.
+  // To avoid overestimating the cost by counting these repeated shuffles more
+  // than once, we only account for unique shuffle patterns. This adjustment
+  // prevents inflated costs in the cost model for wide vectors split into
+  // several register-sized groups.
+  std::set<SmallVector<int>> UniqueShuffles;
+  auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+    // Compute the cost for performing the shuffle over the full vector.
+    auto ShuffleCost =
+        TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
+    unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
+    if (NumFullVectors < 2)
+      return C + ShuffleCost;
+    SmallVector<int> SubShuffle(MaxElementsInVector);
+    unsigned NumUniqueGroups = 0;
+    unsigned NumGroups = Mask.size() / MaxElementsInVector;
+    // For each group of MaxElementsInVector contiguous elements,
+    // collect their shuffle pattern and insert into the set of unique patterns.
+    for (unsigned k = 0; k < NumFullVectors; ++k) {
+      for (unsigned l = 0; l < MaxElementsInVector; ++l)
+        SubShuffle[l] = Mask[MaxElementsInVector * k + l];
+      if (UniqueShuffles.insert(SubShuffle).second)
+        NumUniqueGroups += 1;
+    }
+    return C + ShuffleCost * NumUniqueGroups / NumGroups;
+  };
+  auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
+    auto *SV = dyn_cast<ShuffleVectorInst>(I);
+    if (!SV)
+      return C;
+    SmallVector<int> Mask;
+    SV->getShuffleMask(Mask);
+    return AddShuffleMaskAdjustedCost(C, Mask);
+  };
+  // Check that the input consists of ShuffleVectors applied to the same inputs.
+  auto AllShufflesHaveSameOperands =
+      [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
+        if (InputShuffles.size() < 2)
+          return false;
+        ShuffleVectorInst *FirstSV =
+            dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
+        if (!FirstSV)
+          return false;
+
+        Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
+        return std::all_of(
+            std::next(InputShuffles.begin()), InputShuffles.end(),
+            [&](Instruction *I) {
+              ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
+              return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
+            });
+      };
+
   // Get the costs of the shuffles + binops before and after with the new
   // shuffle masks.
   InstructionCost CostBefore =
@@ -3304,8 +3404,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
       TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
   CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
                                 InstructionCost(0), AddShuffleCost);
-  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
-                                InstructionCost(0), AddShuffleCost);
+  if (AllShufflesHaveSameOperands(InputShuffles)) {
+    UniqueShuffles.clear();
+    CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                  InstructionCost(0), AddShuffleAdjustedCost);
+  } else {
+    CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                  InstructionCost(0), AddShuffleCost);
+  }
 
   // The new binops will be unused for lanes past the used shuffle lengths.
   // These types attempt to get the correct cost for that from the target.
@@ -3316,8 +3422,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   InstructionCost CostAfter =
       TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
       TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
+  UniqueShuffles.clear();
   CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
-                               InstructionCost(0), AddShuffleMaskCost);
+                               InstructionCost(0), AddShuffleMaskAdjustedCost);
   std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
   CostAfter += std::accumulate(OutputShuffleMasks.begin(),
                                OutputShuffleMasks.end(),
@@ -3326,7 +3433,8 @@
   LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
   LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
                     << " vs CostAfter: " << CostAfter << "\n");
-  if (CostBefore <= CostAfter)
+  if (CostBefore < CostAfter || CostBefore == 0 ||
+      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
     return false;
 
   // The cost model has passed, create the new instructions.
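Aside (not part of the patch): the accounting done by AddShuffleMaskAdjustedCost above can be hard to follow inside the diff. The standalone C++ sketch below reproduces the idea with plain STL types: split a wide shuffle mask into register-sized groups and charge the full-vector shuffle cost only in proportion to the groups whose pattern has not been seen before. The name adjustedShuffleCost and the example mask/cost values are hypothetical and exist purely for illustration.

// Standalone illustration of the cost adjustment: identical register-sized
// shuffle groups are only charged once across all masks seen so far.
#include <cassert>
#include <cstdio>
#include <set>
#include <vector>

// Scales FullShuffleCost by the fraction of register-sized groups in Mask
// whose pattern is new to UniqueGroups.
static unsigned adjustedShuffleCost(const std::vector<int> &Mask,
                                    unsigned ElementsPerRegister,
                                    unsigned FullShuffleCost,
                                    std::set<std::vector<int>> &UniqueGroups) {
  unsigned NumGroups = Mask.size() / ElementsPerRegister;
  if (NumGroups < 2)
    return FullShuffleCost;
  unsigned NumUnique = 0;
  for (unsigned G = 0; G < NumGroups; ++G) {
    std::vector<int> Sub(Mask.begin() + G * ElementsPerRegister,
                         Mask.begin() + (G + 1) * ElementsPerRegister);
    if (UniqueGroups.insert(Sub).second)
      ++NumUnique;
  }
  return FullShuffleCost * NumUnique / NumGroups;
}

int main() {
  // A 16-element mask made of four identical 4-element groups: with four i32
  // lanes per register, only one of the four groups is unique, so only a
  // quarter of the full-vector shuffle cost is charged.
  std::vector<int> Mask = {0, 16, 1, 17, 0, 16, 1, 17,
                           0, 16, 1, 17, 0, 16, 1, 17};
  std::set<std::vector<int>> Seen;
  unsigned Cost = adjustedShuffleCost(Mask, /*ElementsPerRegister=*/4,
                                      /*FullShuffleCost=*/8, Seen);
  assert(Cost == 2);
  std::printf("adjusted cost = %u\n", Cost);
  return 0;
}

This mirrors the ShuffleCost * NumUniqueGroups / NumGroups scaling in the patch, with the UniqueGroups set playing the role of UniqueShuffles shared across the masks being accumulated.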
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 26573a3e613da..a2019836098e8 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -80,29 +80,33 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
 ; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
-; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP52]], <16 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32>
-; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32>
-; CHECK-NEXT:    [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
-; CHECK-NEXT:    [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP67:%.*]] = mul nuw <16 x i32> [[TMP66]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP64]]
-; CHECK-NEXT:    [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP67]]
-; CHECK-NEXT:    [[TMP70:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP70]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP70]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32>
+; CHECK-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], splat (i32 15)
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]