From 92b3a6ab8bb66bd58f0de9885e0656ae8fe3cb3d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 14 Aug 2024 18:35:47 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 345 ++++++++--------- .../PhaseOrdering/AArch64/slpordering.ll | 74 ++-- .../SLPVectorizer/AArch64/getelementptr.ll | 11 +- .../SLPVectorizer/AArch64/loadorder.ll | 192 ++++----- .../AArch64/multiple_reduction.ll | 365 +++++++----------- .../AArch64/scalarization-overhead.ll | 62 ++- .../AArch64/shuffle-vectors-mask-size.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 8 +- .../vectorizable-selects-uniform-cmps.ll | 32 +- .../RISCV/combined-loads-stored.ll | 7 +- .../SLPVectorizer/RISCV/reductions.ll | 48 ++- .../SLPVectorizer/SystemZ/pr34619.ll | 11 +- .../Transforms/SLPVectorizer/X86/addsub.ll | 18 +- .../X86/extract-many-users-buildvector.ll | 43 +-- .../X86/extract-scalar-from-undef.ll | 27 +- .../X86/gather-node-same-as-vect-but-order.ll | 13 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 16 +- .../SLPVectorizer/X86/inst_size_bug.ll | 18 +- .../SLPVectorizer/X86/landing_pad.ll | 19 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 53 +-- .../SLPVectorizer/X86/reduction-logical.ll | 17 +- .../X86/remark-partial-loads-vectorize.ll | 16 +- .../X86/scatter-vectorize-reused-pointer.ll | 26 +- .../X86/schedule_budget_debug_info.ll | 40 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 39 +- .../Transforms/SLPVectorizer/X86/tiny-tree.ll | 5 +- .../X86/vect-gather-same-nodes.ll | 6 +- 27 files changed, 722 insertions(+), 796 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 81841a8f69287..b32017adcf8ca 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3104,6 +3104,10 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. int Idx = -1; + /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// other nodes as a serie of insertvector instructions. + SmallVector, 0> CombinedEntriesWithIndices; + private: /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the @@ -3404,7 +3408,9 @@ class BoUpSLP { if (!isConstant(V)) { auto *I = dyn_cast(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); - ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); + if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || + !UserTreeIdx.UserTE->isGather()) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (AllConstsOrCasts) CastMaxMinBWSizes = @@ -8361,8 +8367,49 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef Ptrs, void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - for (std::unique_ptr &TE : VectorizableTree) { - TreeEntry &E = *TE; + // The tree may grow here, so iterate over nodes, built before. + for (unsigned Idx : seq(VectorizableTree.size())) { + TreeEntry &E = *VectorizableTree[Idx]; + if (E.isGather()) { + ArrayRef VL = E.Scalars; + const unsigned Sz = getVectorElementSize(VL.front()); + unsigned MinVF = getMinVF(2 * Sz); + if (VL.size() <= 2 || + (E.getOpcode() && + (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) + continue; + // Try to find vectorizable sequences and transform them into a serie of + // insertvector instructions. + unsigned StartIdx = 0; + unsigned End = VL.size(); + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + InstructionsState S = getSameOpcode(Slice, *TLI); + if (!S.getOpcode() || S.isAltShuffle() || + (S.getOpcode() != Instruction::Load && + any_of(Slice, [&](Value *V) { + return !areAllUsersVectorized(cast(V), + UserIgnoreList); + }))) + continue; + if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) { + unsigned PrevSize = VectorizableTree.size(); + buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); + if (PrevSize + 1 == VectorizableTree.size() && + VectorizableTree[PrevSize]->isGather()) { + VectorizableTree.pop_back(); + continue; + } + E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); + if (StartIdx == Cnt) + StartIdx = Cnt + VF; + if (End == Cnt + VF) + End = Cnt; + } + } + } + } switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -8485,175 +8532,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. - InstructionsState S = getSameOpcode(VL, *R.TLI); - const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = R.getMinVF(2 * Sz); - if (VL.size() > 2 && - ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || - (InVectors.empty() && - any_of(seq(0, VL.size() / MinVF), - [&](unsigned Idx) { - ArrayRef SubVL = VL.slice(Idx * MinVF, MinVF); - InstructionsState S = getSameOpcode(SubVL, *R.TLI); - return S.getOpcode() == Instruction::Load && - !S.isAltShuffle(); - }))) && - !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && - !isSplat(Gathers)) { - InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); - SetVector VectorizedLoads; - SmallVector> VectorizedStarts; - SmallVector ScatterVectorized; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - for (; VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { - InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); - if (SliceS.getOpcode() != Instruction::Load || - SliceS.isAltShuffle()) - continue; - } - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(), - CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - case LoadsState::StridedVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - // TODO: better handling of loads with reorders. - if (((LS == LoadsState::Vectorize || - LS == LoadsState::StridedVectorize) && - CurrentOrder.empty()) || - (LS == LoadsState::StridedVectorize && - isReverseOrder(CurrentOrder))) - VectorizedStarts.emplace_back(Cnt, LS); - else - ScatterVectorized.push_back(Cnt); - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize - // it again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - unsigned NumParts = TTI.getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += - getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); - } - // Exclude potentially vectorized loads from list of gathered - // scalars. - Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += - TTI.getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LoadTy = getWidenedType(VL.front()->getType(), VF); - for (const std::pair &P : VectorizedStarts) { - auto *LI = cast(VL[P.first]); - Align Alignment = LI->getAlign(); - GatherCost += - P.second == LoadsState::Vectorize - ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI) - : TTI.getStridedMemoryOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - // Add external uses costs. - for (auto [Idx, V] : enumerate(VL.slice( - P.first, std::min(VL.size() - P.first, VF)))) - if (!R.areAllUsersVectorized(cast(V))) - GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement, - LoadTy, CostKind, Idx); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(VL.slice(P.first, VF))) - PointerOps[I] = cast(V)->getPointerOperand(); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), - Instruction::Load, CostKind, LI->getType(), LoadTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - } - for (unsigned P : ScatterVectorized) { - auto *LI0 = cast(VL[P]); - ArrayRef Slice = VL.slice(P, VF); - Align CommonAlignment = computeCommonAlignment(Slice); - GatherCost += TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, LI0); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(Slice)) - PointerOps[I] = cast(V)->getPointerOperand(); - OrdersType Order; - if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, - Order)) { - // TODO: improve checks if GEPs can be vectorized. - Value *Ptr0 = PointerOps.front(); - Type *ScalarTy = Ptr0->getType(); - auto *VecTy = getWidenedType(ScalarTy, VF); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, - CostKind, ScalarTy, VecTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - if (!Order.empty()) { - SmallVector Mask; - inversePermutation(Order, Mask); - GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - VecTy, Mask, CostKind); - } - } else { - GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, - PointerOps.front()->getType()); - } - } - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - SmallVector ShuffleMask(VL.size()); - for (unsigned I = VF, E = VL.size(); I < E; I += VF) { - for (unsigned Idx : seq(0, E)) - ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx; - GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, - ShuffleMask, CostKind, I, LoadTy); - } - } - GatherCost -= ScalarsCost; - } - GatherCost = std::min(BaseCost, GatherCost); - } else if (!Root && isSplat(VL)) { + if (!Root && isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as // the broadcast. const auto *It = find_if_not(VL, IsaPred); @@ -9401,7 +9280,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. InstructionCost - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; if (Action) { @@ -9419,6 +9300,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Action(V, CommonMask); InVectors.front() = V; } + if (!SubVectors.empty()) { + const PointerUnion &Vec = InVectors.front(); + if (InVectors.size() == 2) + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + else + Cost += createShuffle(Vec, nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (const auto [E, Idx] : SubVectors) { + Cost += ::getShuffleCost( + TTI, TTI::SK_InsertSubvector, + FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, + CostKind, Idx, + FixedVectorType::get(ScalarTy, E->getVectorFactor())); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + E->getVectorFactor()), + Idx); + } + } + } + ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); if (CommonMask.empty()) { assert(InVectors.size() == 1 && "Expected only one vector with no mask"); @@ -10942,8 +10846,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (CanBeUsedAsScalar) { InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); bool KeepScalar = ScalarCost <= ExtraCost; - if (KeepScalar && ScalarCost != TTI::TCC_Free && - ExtraCost - ScalarCost <= TTI::TCC_Basic) { + // Try to keep original scalar if the user is the phi node from the same + // block as the root phis, currently vectorized. It allows to keep + // better ordering info of PHIs, being vectorized currently. + bool IsProfitablePHIUser = + (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && + VectorizableTree.front()->Scalars.size() > 2)) && + VectorizableTree.front()->getOpcode() == Instruction::PHI && + !Inst->hasNUsesOrMore(UsesLimit) && + none_of(Inst->users(), + [&](User *U) { + auto *PHIUser = dyn_cast(U); + return (!PHIUser || + PHIUser->getParent() != + cast( + VectorizableTree.front()->getMainOp()) + ->getParent()) && + !getTreeEntry(U); + }) && + count_if(Entry->Scalars, [&](Value *V) { + return ValueToExtUses->contains(V); + }) <= 2; + if (IsProfitablePHIUser) { + KeepScalar = true; + } else if (KeepScalar && ScalarCost != TTI::TCC_Free && + ExtraCost - ScalarCost <= TTI::TCC_Basic) { unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { return ValueToExtUses->contains(V); }); @@ -12490,7 +12417,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. Value * - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; SmallVector NewExtMask(ExtMask); @@ -12524,6 +12453,29 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Action(Vec, CommonMask); InVectors.front() = Vec; } + if (!SubVectors.empty()) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (const auto [E, Idx] : SubVectors) { + Vec = Builder.CreateInsertVector( + Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + E->getVectorFactor()), + Idx); + } + } + InVectors.front() = Vec; + } + if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -12602,7 +12554,10 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, : ScalarTy, Builder, *this); ShuffleBuilder.add(V, Mask); - return ShuffleBuilder.finalize(std::nullopt); + SmallVector> SubVectors; + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + SubVectors.emplace_back(VectorizableTree[EIdx].get(), Idx); + return ShuffleBuilder.finalize(std::nullopt, SubVectors); }; Value *V = vectorizeTree(VE, PostponedPHIs); if (VF * getNumElements(VL[0]->getType()) != @@ -12685,6 +12640,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); + // Clear values, to be replaced by insertvector instructions. + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + for_each(MutableArrayRef(GatheredScalars) + .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), + [&](Value *&V) { V = PoisonValue::get(V->getType()); }); + SmallVector> SubVectors; + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + SubVectors.emplace_back(VectorizableTree[EIdx].get(), Idx); // Build a mask out of the reorder indices and reorder scalars per this // mask. SmallVector ReorderMask; @@ -12822,7 +12785,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask()); + Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors); return Res; } if (!Resized) { @@ -13079,10 +13042,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); else Res = ShuffleBuilder.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), + E->ReuseShuffleIndices, SubVectors, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); @@ -13093,7 +13056,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } else { // Gather all constants. SmallVector Mask(GatheredScalars.size(), PoisonMaskElem); @@ -13103,7 +13066,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, Mask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } if (NeedFreeze) @@ -13112,6 +13075,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { + for (const auto [EIdx, _] : E->CombinedEntriesWithIndices) + (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); } @@ -13163,7 +13128,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } - return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + SmallVector> SubVectors; + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + SubVectors.emplace_back(VectorizableTree[EIdx].get(), Idx); + return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); }; assert(!E->isGather() && "Unhandled state"); @@ -14566,7 +14534,7 @@ Value *BoUpSLP::vectorizeTree( ShuffleBuilder.add(V1, CombinedMask1); if (V2) ShuffleBuilder.add(V2, CombinedMask2); - return ShuffleBuilder.finalize(std::nullopt); + return ShuffleBuilder.finalize(std::nullopt, std::nullopt); }; auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef Mask, @@ -14704,7 +14672,14 @@ Value *BoUpSLP::vectorizeTree( // Clear up reduction references, if any. if (UserIgnoreList) { for (Instruction *I : RemovedInsts) { - if (getTreeEntry(I)->Idx != 0) + const TreeEntry *IE = getTreeEntry(I); + if (IE->Idx != 0 && + !(VectorizableTree.front()->isGather() && isa(I) && + !IE->UserTreeIndices.empty() && + any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + }))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 22511c018dca2..2121775224098 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -18,62 +18,62 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index c1cef6ff3d10b..91c8db14a45aa 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -169,11 +169,12 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index d79aed89b0be7..5b878108af59a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -340,12 +340,12 @@ entry: define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -416,31 +416,31 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,63 +677,63 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 -; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2 +; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 -; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 +; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]] +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40 -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44 +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4 +; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4 -; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 -; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,12 +833,12 @@ entry: define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -1203,62 +1203,62 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index d89d628670360..07411cacb3626 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,232 +14,161 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 -; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 -; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 -; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 -; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 -; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 -; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 -; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 -; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 -; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 -; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 -; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 -; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 -; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 -; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 -; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 -; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 -; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 -; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 -; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 -; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 -; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 -; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 -; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 -; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 -; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] -; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] -; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) +; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) +; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) +; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57 +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] +; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 6f6b66255a434..8093285ad8717 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -3,39 +3,63 @@ ; Test case reported on D134605 where the vectorization was causing a slowdown due to an underestimation in the cost of the extractions. +; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> is 12! + define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> -; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index e39cd8aaa111b..4f88182374622 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,16 +7,13 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 95aa40f664c0c..ff1d6253bec92 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -25,11 +25,11 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index b59659ca75eb2..f04c359b432b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -241,12 +241,9 @@ entry: define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_16xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8 ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 -; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 ; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 @@ -254,19 +251,28 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]] -; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] +; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index 94a55c435c8c3..cd79250e8fb6b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,12 +4,11 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index ff3d2c4c59394..151b91184bf42 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1013,22 +1013,20 @@ declare i32 @llvm.abs.i32(i32, i1) define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]] -; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]] +; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] +; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: ret i32 [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: ret i32 [[TMP11]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -1068,12 +1066,11 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -1117,12 +1114,11 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %0 = load i8, ptr %x, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 0fcbead65d0d6..413aedefe9b6a 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,12 +13,11 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index f7bd2431a7605..96b498ced7d0f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -392,16 +392,14 @@ define void @vec_shuff_reorder() #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP11]], ptr @fc, align 4 ; CHECK-NEXT: ret void ; %1 = load float, ptr @fb, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index 3b03ca13ea65d..87b1302e4cecf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,30 +6,25 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) -; CHECK-NEXT: ret i1 [[TMP27]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> +; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) +; CHECK-NEXT: ret i1 [[TMP22]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index d326c855a1091..6ff03acf85cdf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,20 +4,19 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index d80d7b5ecd4e7..757d0b1708b6f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -8,19 +8,18 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4 +; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index fa022ad69af79..b0d9fea43a0e6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1016,15 +1016,13 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]]) -; THRESH-NEXT: ret i32 [[TMP10]] +; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) +; THRESH-NEXT: ret i32 [[TMP8]] ; %2 = load i32, ptr @arr, align 16 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 6c4572593027d..54c950a078502 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -4,14 +4,20 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMPL1:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR2]], align 4 +; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] +; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 47b42bc8f32a7..813c5e7418b30 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,10 +10,10 @@ define void @foo() personality ptr @bar { ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -21,29 +21,30 @@ define void @foo() personality ptr @bar { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP8]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 2) ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] +; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 495a503311ab9..7201583f3450e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -136,42 +136,43 @@ for.end: ; preds = %for.body define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00 +; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 ; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], -; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], +; CHECK-NEXT: [[TMP14]] = fadd <4 x float> [[TMP3]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP14]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 865d817866716..12389f4a3dbf4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -390,14 +390,15 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) -; CHECK-NEXT: ret i1 [[TMP8]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) +; CHECK-NEXT: ret i1 [[TMP9]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 7de2cde45525a..8aaa71ef47a8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -10,16 +10,7 @@ ; YAML-NEXT: - String: 'SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' -; YAML-LABEL: --- !Passed -; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList -; YAML-NEXT: Function: test -; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-2' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '2' +; YAML-NEXT: - TreeSize: '5' define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-LABEL: define <4 x float> @test( @@ -28,9 +19,8 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index dadf5992ba288..c01c44ff03c15 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,23 +5,25 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll index d45054b6bebce..207b2d45c335e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll @@ -14,7 +14,21 @@ declare void @unknown() define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-LABEL: @test( ; VECTOR_DBG-NEXT: entry: -; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_DBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_DBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_DBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() @@ -43,22 +57,22 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_DBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_DBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_DBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 ; VECTOR_DBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_DBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_DBG-NEXT: ret void ; ; VECTOR_NODBG-LABEL: @test( ; VECTOR_NODBG-NEXT: entry: -; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_NODBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_NODBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_NODBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_NODBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() @@ -87,7 +101,9 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() -; VECTOR_NODBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_NODBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_NODBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_NODBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 ; VECTOR_NODBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_NODBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_NODBG-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6825f43b5a9eb..6ca1f8119c1cf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -14,22 +14,21 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -106,11 +105,10 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -165,14 +163,11 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index eb3d395f4c6a6..3eabed5882e58 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,9 +160,8 @@ define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[DST_ADDR_022]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index 6ac6884ca5377..e1b091cc6fcda 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,14 +8,14 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer