From b3dfa6a56c36cda420efce4c492b3b9e34318125 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 18 Jun 2025 15:50:31 +0100 Subject: [PATCH 1/2] [LV] Use VPReductionRecipe for partial reductions Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on https://github.com/llvm/llvm-project/pull/144281 --- .../Transforms/Vectorize/LoopVectorize.cpp | 42 +- llvm/lib/Transforms/Vectorize/VPlan.h | 173 ++--- .../Transforms/Vectorize/VPlanAnalysis.cpp | 19 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 167 +--- .../Transforms/Vectorize/VPlanTransforms.cpp | 6 +- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 12 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 - .../partial-reduce-dot-product-epilogue.ll | 246 ++---- .../partial-reduce-dot-product-neon.ll | 726 ++++++------------ .../AArch64/partial-reduce-dot-product.ll | 54 +- .../LoopVectorize/AArch64/vplan-printing.ll | 6 +- .../Transforms/Vectorize/VPlanTest.cpp | 8 +- 12 files changed, 514 insertions(+), 946 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5c3f17860103..50846772c9ce4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6960,7 +6960,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } // The VPlan-based cost model is more accurate for partial reduction and // comparing against the legacy cost isn't desirable. - if (isa(&R)) + if (auto *VPR = dyn_cast(&R); + VPR && VPR->isPartialReduction()) return true; // The VPlan-based cost model can analyze if recipes are scalar @@ -8080,11 +8081,21 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); // If the PHI is used by a partial reduction, set the scale factor. + bool UseInLoopReduction = CM.isInLoopReduction(Phi); + bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc); unsigned ScaleFactor = getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); - PhiRecipe = new VPReductionPHIRecipe( - Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc), ScaleFactor); + RdxStyle Style(RdxNormal{}); + if (UseInLoopReduction) { + if (UseOrderedReductions) + Style = RdxOrderedInLoop{}; + else + Style = RdxInLoop{}; + } else if (ScaleFactor > 1) { + Style = RdxPartial{/*VFScaleFactor=*/ScaleFactor}; + } + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc.getRecurrenceKind(), + *StartV, Style); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8152,7 +8163,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, VPValue *Accumulator = Operands[1]; VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); if (isa(BinOpRecipe) || - isa(BinOpRecipe)) + (isa(BinOpRecipe) && + cast(BinOpRecipe)->isPartialReduction())) std::swap(BinOp, Accumulator); if (ScaleFactor != @@ -8177,11 +8189,11 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0); - BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } - return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, - ScaleFactor, Reduction); + + return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction, + Accumulator, BinOp, Cond, + RdxPartial{/*VFScaleFactor=*/ScaleFactor}); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8687,9 +8699,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( FastMathFlags FMFs = isa(CurrentLinkI) ? RdxDesc.getFastMathFlags() : FastMathFlags(); - auto *RedRecipe = new VPReductionRecipe( - Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, - PhiR->isOrdered(), CurrentLinkI->getDebugLoc()); + bool UseOrderedReductions = PhiR->isOrdered(); + RdxStyle Style = UseOrderedReductions ? RdxStyle(RdxOrderedInLoop{}) + : RdxStyle(RdxInLoop{}); + auto *RedRecipe = + new VPReductionRecipe(Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, + CondOp, Style, CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Delete CurrentLink as it will be invalid if its operand is replaced @@ -8724,8 +8739,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Don't output selects for partial reductions because they have an output // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. + auto *RR = dyn_cast(OrigExitingVPV->getDefiningRecipe()); if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa(OrigExitingVPV->getDefiningRecipe())) { + (!RR || !RR->isPartialReduction())) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional FMFs = PhiTy->isFloatingPointTy() diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfe1f1e9d7528..60831198905b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -43,6 +43,7 @@ #include #include #include +#include namespace llvm { @@ -553,7 +554,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenIntOrFpInductionSC: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: - case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveEVLSC: @@ -2330,6 +2330,23 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { } }; +// Possible variants of a reduction. + +// This reduction is ordered and in-loop. +struct RdxOrderedInLoop {}; +// This reduction is in-loop. +struct RdxInLoop {}; +// This reduction isn't partial, ordered or in-loop. +struct RdxNormal {}; +// This reduction is partial and its output is a vector whose length is scaled +// by the VF. +struct RdxPartial { + // The factor by which the output is scaled down from the VF. + unsigned VFScaleFactor; +}; +typedef std::variant + RdxStyle; + /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. @@ -2338,32 +2355,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The recurrence kind of the reduction. const RecurKind Kind; - /// The phi is part of an in-loop reduction. - bool IsInLoop; - - /// The phi is part of an ordered reduction. Requires IsInLoop to be true. - bool IsOrdered; - - /// When expanding the reduction PHI, the plan's VF element count is divided - /// by this factor to form the reduction phi's VF. - unsigned VFScaleFactor = 1; + RdxStyle Style; public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi. VPReductionPHIRecipe(PHINode *Phi, RecurKind Kind, VPValue &Start, - bool IsInLoop = false, bool IsOrdered = false, - unsigned VFScaleFactor = 1) + RdxStyle Style) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), Kind(Kind), - IsInLoop(IsInLoop), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) { - assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); - } + Style(Style) {} ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { auto *R = new VPReductionPHIRecipe( dyn_cast_or_null(getUnderlyingValue()), getRecurrenceKind(), - *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor); + *getOperand(0), Style); R->addOperand(getBackedgeValue()); return R; } @@ -2373,8 +2379,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; - /// Get the factor that the VF of this recipe's output should be scaled by. - unsigned getVFScaleFactor() const { return VFScaleFactor; } + /// Get the factor that the VF of this recipe's output should be scaled by, or + /// null if it isn't scaled. + std::optional getVFScaleFactor() const { + auto *Partial = std::get_if(&Style); + return Partial ? std::make_optional(Partial->VFScaleFactor) : std::nullopt; + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -2391,10 +2401,18 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, RecurKind getRecurrenceKind() const { return Kind; } /// Returns true, if the phi is part of an ordered reduction. - bool isOrdered() const { return IsOrdered; } + bool isOrdered() const { + return std::holds_alternative(Style); + } + + /// Returns true if the phi is part of an in-loop reduction. + bool isInLoop() const { + return std::holds_alternative(Style) || + std::holds_alternative(Style); + } - /// Returns true, if the phi is part of an in-loop reduction. - bool isInLoop() const { return IsInLoop; } + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return getVFScaleFactor() > 1; } /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -2666,23 +2684,25 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { } }; -/// A recipe to represent inloop reduction operations, performing a reduction on -/// a vector operand into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, [Condition]}. +/// A recipe to represent inloop, ordered or partial reduction operations. It +/// performs a reduction on a vector operand into a scalar (vector in the case +/// of a partial reduction) value, and adds the result to a chain. The Operands +/// are {ChainOp, VecOp, [Condition]}. class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { + /// The recurrence kind for the reduction in question. RecurKind RdxKind; - bool IsOrdered; /// Whether the reduction is conditional. bool IsConditional = false; + RdxStyle Style; protected: VPReductionRecipe(const unsigned char SC, RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, ArrayRef Operands, VPValue *CondOp, - bool IsOrdered, DebugLoc DL) + RdxStyle Style, DebugLoc DL) : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind), - IsOrdered(IsOrdered) { + Style(Style) { if (CondOp) { IsConditional = true; addOperand(CondOp); @@ -2693,30 +2713,29 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) + RdxStyle Style, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, - ArrayRef({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + ArrayRef({ChainOp, VecOp}), CondOp, Style, + DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) + RdxStyle Style, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, - ArrayRef({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + ArrayRef({ChainOp, VecOp}), CondOp, Style, + DL) {} ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { return new VPReductionRecipe(RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(), getVecOp(), - getCondOp(), IsOrdered, getDebugLoc()); + getCondOp(), Style, getDebugLoc()); } static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || - R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; } static inline bool classof(const VPUser *U) { @@ -2749,9 +2768,13 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { /// Return the recurrence kind for the in-loop reduction. RecurKind getRecurrenceKind() const { return RdxKind; } /// Return true if the in-loop reduction is ordered. - bool isOrdered() const { return IsOrdered; }; + bool isOrdered() const { + return std::holds_alternative(Style); + }; /// Return true if the in-loop reduction is conditional. bool isConditional() const { return IsConditional; }; + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return getVFScaleFactor() > 1; } /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } /// The VPValue of the vector value to be reduced. @@ -2760,68 +2783,12 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { VPValue *getCondOp() const { return isConditional() ? getOperand(getNumOperands() - 1) : nullptr; } -}; - -/// A recipe for forming partial reductions. In the loop, an accumulator and -/// vector operand are added together and passed to the next iteration as the -/// next accumulator. After the loop body, the accumulator is reduced to a -/// scalar value. -class VPPartialReductionRecipe : public VPReductionRecipe { - unsigned Opcode; - - /// The divisor by which the VF of this recipe's output should be divided - /// during execution. - unsigned VFScaleFactor; - -public: - VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor) - : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond, - VFScaleFactor, ReductionInst) {} - VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, - VPValue *Cond, unsigned ScaleFactor, - Instruction *ReductionInst = nullptr) - : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add, - FastMathFlags(), ReductionInst, - ArrayRef({Op0, Op1}), Cond, false, {}), - Opcode(Opcode), VFScaleFactor(ScaleFactor) { - [[maybe_unused]] auto *AccumulatorRecipe = - getChainOp()->getDefiningRecipe(); - // When cloning as part of a VPExpressionRecipe the chain op could have - // replaced by a temporary VPValue, so it doesn't have a defining recipe. - assert((!AccumulatorRecipe || - isa(AccumulatorRecipe) || - isa(AccumulatorRecipe)) && - "Unexpected operand order for partial reduction recipe"); - } - ~VPPartialReductionRecipe() override = default; - - VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - getCondOp(), VFScaleFactor, - getUnderlyingInstr()); - } - - VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) - - /// Generate the reduction in the loop. - void execute(VPTransformState &State) override; - - /// Return the cost of this VPPartialReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - - /// Get the binary op's opcode. - unsigned getOpcode() const { return Opcode; } - - /// Get the factor that the VF of this recipe's output should be scaled by. - unsigned getVFScaleFactor() const { return VFScaleFactor; } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif + /// Get the factor that the VF of this recipe's output should be scaled by, or + /// null if it isn't scaled. + std::optional getVFScaleFactor() const { + auto *Partial = std::get_if(&Style); + return Partial ? std::make_optional(Partial->VFScaleFactor) : std::nullopt; + } }; /// A recipe to represent inloop reduction operations with vector-predication @@ -2837,7 +2804,9 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { R.getFastMathFlags(), cast_or_null(R.getUnderlyingValue()), ArrayRef({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, - R.isOrdered(), DL) {} + R.isOrdered() ? RdxStyle(RdxOrderedInLoop{}) + : RdxStyle(RdxInLoop{}), + DL) {} ~VPReductionEVLRecipe() override = default; @@ -3101,8 +3070,8 @@ class VPExpressionRecipe : public VPSingleDefRecipe { void decompose(); unsigned getVFScaleFactor() const { - auto *PR = dyn_cast(ExpressionRecipes.back()); - return PR ? PR->getVFScaleFactor() : 1; + auto *PR = dyn_cast(ExpressionRecipes.back()); + return PR ? PR->getVFScaleFactor().value_or(1) : 1; } /// Method for generating code, must not be called as this recipe is abstract. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..f84a7914ec850 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -288,10 +288,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. .Case( @@ -561,11 +561,12 @@ SmallVector llvm::calculateRegisterUsageForPlan( // fewer lanes than the VF. unsigned ScaleFactor = vputils::getVFScaleFactor(VPV->getDefiningRecipe()); - ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); - LLVM_DEBUG(if (VF != VFs[J]) { - dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF - << " for " << *R << "\n"; - }); + ElementCount VF = VFs[J]; + if (ScaleFactor > 1) { + VF = VFs[J].divideCoefficientBy(ScaleFactor); + LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] + << " to " << VF << " for " << *R << "\n";); + } Type *ScalarTy = TypeInfo.inferScalarType(VPV); unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1ee405a62aa68..28d38a4e4b731 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -176,7 +176,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { return cast(this)->mayHaveSideEffects(); case VPBlendSC: case VPReductionEVLSC: - case VPPartialReductionSC: case VPReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: @@ -305,119 +304,6 @@ bool VPRecipeBase::isScalarCast() const { return VPI && Instruction::isCast(VPI->getOpcode()); } -InstructionCost -VPPartialReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - std::optional Opcode; - VPValue *Op = getVecOp(); - uint64_t MulConst; - // If the partial reduction is predicated, a select will be operand 1. - // If it isn't predicated and the mul isn't operating on a constant, then it - // should have been turned into a VPExpressionRecipe. - // FIXME: Replace the entire function with this once all partial reduction - // variants are bundled into VPExpressionRecipe. - if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && - !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { - auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); - auto *InputType = Ctx.Types.inferScalarType(getVecOp()); - return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, - PhiType, VF, TTI::PR_None, - TTI::PR_None, {}, Ctx.CostKind); - } - - VPRecipeBase *OpR = Op->getDefiningRecipe(); - Type *InputTypeA = nullptr, *InputTypeB = nullptr; - TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, - ExtBType = TTI::PR_None; - - auto GetExtendKind = [](VPRecipeBase *R) { - if (!R) - return TTI::PR_None; - auto *WidenCastR = dyn_cast(R); - if (!WidenCastR) - return TTI::PR_None; - if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) - return TTI::PR_ZeroExtend; - if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) - return TTI::PR_SignExtend; - return TTI::PR_None; - }; - - // Pick out opcode, type/ext information and use sub side effects from a widen - // recipe. - auto HandleWiden = [&](VPWidenRecipe *Widen) { - if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) { - Widen = dyn_cast(Op->getDefiningRecipe()); - } - Opcode = Widen->getOpcode(); - VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); - VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe(); - InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) - : Widen->getOperand(0)); - InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) - : Widen->getOperand(1)); - ExtAType = GetExtendKind(ExtAR); - ExtBType = GetExtendKind(ExtBR); - - using namespace VPlanPatternMatch; - const APInt *C; - if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) && - canConstantBeExtended(C, InputTypeA, ExtAType)) { - InputTypeB = InputTypeA; - ExtBType = ExtAType; - } - }; - - if (isa(OpR)) { - InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); - ExtAType = GetExtendKind(OpR); - } else if (isa(OpR)) { - auto RedPhiOp1R = getOperand(1)->getDefiningRecipe(); - if (isa(RedPhiOp1R)) { - InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); - ExtAType = GetExtendKind(RedPhiOp1R); - } else if (auto Widen = dyn_cast(RedPhiOp1R)) - HandleWiden(Widen); - } else if (auto Widen = dyn_cast(OpR)) { - HandleWiden(Widen); - } else if (auto Reduction = dyn_cast(OpR)) { - return Reduction->computeCost(VF, Ctx); - } - auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); - return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, - PhiType, VF, ExtAType, ExtBType, - Opcode, Ctx.CostKind); -} - -void VPPartialReductionRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); - - Value *BinOpVal = State.get(getOperand(1)); - Value *PhiVal = State.get(getOperand(0)); - assert(PhiVal && BinOpVal && "Phi and Mul must be set"); - - Type *RetTy = PhiVal->getType(); - - CallInst *V = - Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); - - State.set(this, V); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PARTIAL-REDUCE "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; - printOperands(O, SlotTracker); -} -#endif - void VPIRFlags::intersectFlags(const VPIRFlags &Other) { assert(OpType == Other.OpType && "OpType must match"); switch (OpType) { @@ -2684,7 +2570,6 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); RecurKind Kind = getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && "In-loop AnyOf reductions aren't currently supported"); @@ -2706,7 +2591,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { } Value *NewRed; Value *NextInChain; - if (IsOrdered) { + if (isOrdered()) { + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); if (State.VF.isVector()) NewRed = createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); @@ -2716,8 +2602,16 @@ void VPReductionRecipe::execute(VPTransformState &State) { PrevInChain, NewVecOp); PrevInChain = NewRed; NextInChain = NewRed; + } else if (isPartialReduction()) { + assert(Kind == RecurKind::Add && "Unexpected partial reduction kind"); + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false); + NewRed = State.Builder.CreateIntrinsic( + PrevInChain->getType(), Intrinsic::vector_partial_reduce_add, + {PrevInChain, NewVecOp}, nullptr, "partial.reduce"); + PrevInChain = NewRed; + NextInChain = NewRed; } else { - PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); @@ -2726,7 +2620,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), PrevInChain, NewRed); } - State.set(this, NextInChain, /*IsScalar*/ true); + State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction()); } void VPReductionEVLRecipe::execute(VPTransformState &State) { @@ -2773,6 +2667,12 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, std::optional OptionalFMF = ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; + if (isPartialReduction()) + return Ctx.TTI.getPartialReductionCost( + Opcode, ElementTy, ElementTy, ElementTy, VF, + TargetTransformInfo::PR_None, TargetTransformInfo::PR_None, + std::nullopt, Ctx.CostKind); + // TODO: Support any-of reductions. assert( (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || @@ -2878,7 +2778,9 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, unsigned Opcode = RecurrenceDescriptor::getOpcode( cast(ExpressionRecipes[1])->getRecurrenceKind()); auto *ExtR = cast(ExpressionRecipes[0]); - return isa(ExpressionRecipes.back()) + + return cast(ExpressionRecipes.back()) + ->isPartialReduction() ? Ctx.TTI.getPartialReductionCost( Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF, @@ -2898,7 +2800,8 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, Opcode = Instruction::Sub; [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { - if (isa(ExpressionRecipes.back())) { + auto *RedR = cast(ExpressionRecipes.back()); + if (RedR->isPartialReduction()) { auto *Ext0R = cast(ExpressionRecipes[0]); auto *Ext1R = cast(ExpressionRecipes[1]); auto *Mul = cast(ExpressionRecipes[2]); @@ -2937,8 +2840,8 @@ bool VPExpressionRecipe::mayHaveSideEffects() const { bool VPExpressionRecipe::isSingleScalar() const { // Cannot use vputils::isSingleScalar(), because all external operands // of the expression will be live-ins while bundled. - return isa(ExpressionRecipes.back()) && - !isa(ExpressionRecipes.back()); + auto *RR = dyn_cast(ExpressionRecipes.back()); + return RR && !RR->isPartialReduction(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2950,12 +2853,11 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, O << " = "; auto *Red = cast(ExpressionRecipes.back()); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - bool IsPartialReduction = isa(Red); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { getOperand(1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName(Opcode) << " ("; getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); @@ -2972,7 +2874,7 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, } case ExpressionTypes::ExtNegatedMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " (sub (0, mul"; @@ -2997,7 +2899,7 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " ("; @@ -3034,7 +2936,10 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; + if (isPartialReduction()) + O << Indent << "PARTIAL-REDUCE "; + else + O << Indent << "REDUCE "; printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); @@ -4416,7 +4321,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // this value when we vectorize all of the instructions that use the PHI. BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - bool ScalarPHI = State.VF.isScalar() || IsInLoop; + bool ScalarPHI = State.VF.isScalar() || isInLoop(); Value *StartV = State.get(StartVPV, ScalarPHI); Type *VecTy = StartV->getType(); @@ -4425,7 +4330,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { "recipe must be in the vector loop header"); auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); Phi->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, Phi, IsInLoop); + State.set(this, Phi, isInLoop()); Phi->addIncoming(StartV, VectorPH); } @@ -4438,8 +4343,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); - if (VFScaleFactor != 1) - O << " (VF scaled by 1/" << VFScaleFactor << ")"; + if (getVFScaleFactor() > 1) + O << " (VF scaled by 1/" << getVFScaleFactor() << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2588c878d8472..50f9578196594 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3551,7 +3551,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, cast(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); - if (isa(Red)) { + if (Red->isPartialReduction()) { TargetTransformInfo::PartialReductionExtendKind ExtKind = TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); // FIXME: Move partial reduction creation, costing and clamping @@ -3592,7 +3592,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { - bool IsPartialReduction = isa(Red); + bool IsPartialReduction = Red->isPartialReduction(); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) @@ -3611,7 +3611,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; InstructionCost MulAccCost; - if (IsPartialReduction) { + if (Red->isPartialReduction()) { Type *SrcTy2 = Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; // FIXME: Move partial reduction creation, costing and clamping diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6380d30ab2e2..c14c7921978df 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -175,10 +175,8 @@ bool vputils::isSingleScalar(const VPValue *VPV) { return VPI->isSingleScalar() || VPI->isVectorToScalar() || (PreservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isSingleScalar)); - if (isa(VPV)) - return false; - if (isa(VPV)) - return true; + if (auto *RR = dyn_cast(VPV)) + return !RR->isPartialReduction(); if (auto *Expr = dyn_cast(VPV)) return Expr->isSingleScalar(); @@ -242,9 +240,9 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) { if (!R) return 1; if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); - if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); + return RR->getVFScaleFactor().value_or(1); + if (auto *RR = dyn_cast(R)) + return RR->getVFScaleFactor().value_or(1); if (auto *ER = dyn_cast(R)) return ER->getVFScaleFactor(); assert( diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fcaaeee2b..b4eff8ccd63e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -341,7 +341,6 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, - VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, VPVectorPointerSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index d8f1a86c9ebda..1b523940cbf20 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -182,316 +182,236 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: ; CHECK-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: ; CHECK-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: ; CHECK-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: ; CHECK-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: ; CHECK-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: ; CHECK-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: ; CHECK-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: ; CHECK-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: ; CHECK-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: ; CHECK-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: ; CHECK-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: ; CHECK-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK: pred.load.if29: +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK: pred.load.if37: -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: -; CHECK-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK: pred.load.if39: -; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK: pred.load.continue40: -; CHECK-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK: pred.load.if41: -; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK: pred.load.continue42: -; CHECK-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK: pred.load.if43: -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.continue44: -; CHECK-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK: pred.load.if45: -; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK: pred.load.continue46: -; CHECK-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK: pred.load.if47: -; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK: pred.load.continue48: -; CHECK-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK: pred.load.if49: -; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK: pred.load.continue50: -; CHECK-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK: pred.load.if51: -; CHECK-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK: pred.load.continue52: -; CHECK-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK: pred.load.if53: -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK: pred.load.continue54: -; CHECK-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK: pred.load.if55: -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK: pred.load.continue56: -; CHECK-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK: pred.load.if57: -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK: pred.load.continue58: -; CHECK-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK: pred.load.if59: -; CHECK-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK: pred.load.continue60: -; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.if61: ; CHECK-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.continue62: -; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] -; CHECK-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) +; CHECK-NEXT: [[TMP183:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP183]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index e74830700776c..6ead2a4eecbe8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -997,313 +997,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVE1: pred.load.if: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVE1: pred.load.continue: ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVE1: pred.load.if1: +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVE1: pred.load.continue2: ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVE1: pred.load.if3: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVE1: pred.load.continue4: ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVE1: pred.load.if5: +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-INTERLEAVE1: pred.load.continue6: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-INTERLEAVE1: pred.load.if7: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-INTERLEAVE1: pred.load.continue8: ; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-INTERLEAVE1: pred.load.if9: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-INTERLEAVE1: pred.load.continue10: ; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-INTERLEAVE1: pred.load.if11: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-INTERLEAVE1: pred.load.continue12: ; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-INTERLEAVE1: pred.load.if13: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-INTERLEAVE1: pred.load.continue14: ; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-INTERLEAVE1: pred.load.if15: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK-INTERLEAVE1: pred.load.continue16: ; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK-INTERLEAVE1: pred.load.if17: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK-INTERLEAVE1: pred.load.continue18: ; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK-INTERLEAVE1: pred.load.if19: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK-INTERLEAVE1: pred.load.continue20: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK-INTERLEAVE1: pred.load.if21: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK-INTERLEAVE1: pred.load.continue22: ; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK-INTERLEAVE1: pred.load.if23: +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK-INTERLEAVE1: pred.load.continue24: ; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK-INTERLEAVE1: pred.load.if25: +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK-INTERLEAVE1: pred.load.continue26: ; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK-INTERLEAVE1: pred.load.if27: +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK-INTERLEAVE1: pred.load.continue28: ; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK-INTERLEAVE1: pred.load.if29: +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK-INTERLEAVE1: pred.load.continue30: -; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-INTERLEAVE1: pred.load.if31: -; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-INTERLEAVE1: pred.load.continue32: -; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-INTERLEAVE1: pred.load.if33: -; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-INTERLEAVE1: pred.load.continue34: -; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-INTERLEAVE1: pred.load.if35: -; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-INTERLEAVE1: pred.load.continue36: -; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-INTERLEAVE1: pred.load.if37: -; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-INTERLEAVE1: pred.load.continue38: -; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-INTERLEAVE1: pred.load.if39: -; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-INTERLEAVE1: pred.load.continue40: -; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-INTERLEAVE1: pred.load.if41: -; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-INTERLEAVE1: pred.load.continue42: -; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-INTERLEAVE1: pred.load.if43: -; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-INTERLEAVE1: pred.load.continue44: -; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-INTERLEAVE1: pred.load.if45: -; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-INTERLEAVE1: pred.load.continue46: -; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-INTERLEAVE1: pred.load.if47: -; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-INTERLEAVE1: pred.load.continue48: -; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-INTERLEAVE1: pred.load.if49: -; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-INTERLEAVE1: pred.load.continue50: -; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-INTERLEAVE1: pred.load.if51: -; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-INTERLEAVE1: pred.load.continue52: -; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-INTERLEAVE1: pred.load.if53: -; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-INTERLEAVE1: pred.load.continue54: -; CHECK-INTERLEAVE1-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-INTERLEAVE1: pred.load.if55: -; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-INTERLEAVE1: pred.load.continue56: -; CHECK-INTERLEAVE1-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-INTERLEAVE1: pred.load.if57: -; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-INTERLEAVE1: pred.load.continue58: -; CHECK-INTERLEAVE1-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-INTERLEAVE1: pred.load.if59: -; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-INTERLEAVE1: pred.load.continue60: -; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVE1: pred.load.if61: ; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVE1: pred.load.continue62: -; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-INTERLEAVE1: pred.load.continue30: +; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) @@ -1333,313 +1253,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVED: pred.load.if: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVED: pred.load.continue: ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVED: pred.load.if1: +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVED: pred.load.continue2: ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVED: pred.load.if3: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVED: pred.load.continue4: ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVED: pred.load.if5: +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-INTERLEAVED: pred.load.continue6: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-INTERLEAVED: pred.load.if7: +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-INTERLEAVED: pred.load.continue8: ; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-INTERLEAVED: pred.load.if9: +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-INTERLEAVED: pred.load.continue10: ; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-INTERLEAVED: pred.load.if11: +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-INTERLEAVED: pred.load.continue12: ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-INTERLEAVED: pred.load.if13: +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-INTERLEAVED: pred.load.continue14: ; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-INTERLEAVED: pred.load.if15: +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK-INTERLEAVED: pred.load.continue16: ; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK-INTERLEAVED: pred.load.if17: +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK-INTERLEAVED: pred.load.continue18: ; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK-INTERLEAVED: pred.load.if19: +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK-INTERLEAVED: pred.load.continue20: ; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK-INTERLEAVED: pred.load.if21: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK-INTERLEAVED: pred.load.continue22: ; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK-INTERLEAVED: pred.load.if23: +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK-INTERLEAVED: pred.load.continue24: ; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK-INTERLEAVED: pred.load.if25: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK-INTERLEAVED: pred.load.continue26: ; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK-INTERLEAVED: pred.load.if27: +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK-INTERLEAVED: pred.load.continue28: ; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK-INTERLEAVED: pred.load.if29: +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK-INTERLEAVED: pred.load.continue30: -; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-INTERLEAVED: pred.load.if31: -; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-INTERLEAVED: pred.load.continue32: -; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-INTERLEAVED: pred.load.if33: -; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-INTERLEAVED: pred.load.continue34: -; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-INTERLEAVED: pred.load.if35: -; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-INTERLEAVED: pred.load.continue36: -; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-INTERLEAVED: pred.load.if37: -; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-INTERLEAVED: pred.load.continue38: -; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-INTERLEAVED: pred.load.if39: -; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-INTERLEAVED: pred.load.continue40: -; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-INTERLEAVED: pred.load.if41: -; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-INTERLEAVED: pred.load.continue42: -; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-INTERLEAVED: pred.load.if43: -; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-INTERLEAVED: pred.load.continue44: -; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-INTERLEAVED: pred.load.if45: -; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-INTERLEAVED: pred.load.continue46: -; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-INTERLEAVED: pred.load.if47: -; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-INTERLEAVED: pred.load.continue48: -; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-INTERLEAVED: pred.load.if49: -; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-INTERLEAVED: pred.load.continue50: -; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-INTERLEAVED: pred.load.if51: -; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-INTERLEAVED: pred.load.continue52: -; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-INTERLEAVED: pred.load.if53: -; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-INTERLEAVED: pred.load.continue54: -; CHECK-INTERLEAVED-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-INTERLEAVED: pred.load.if55: -; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-INTERLEAVED: pred.load.continue56: -; CHECK-INTERLEAVED-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-INTERLEAVED: pred.load.if57: -; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-INTERLEAVED: pred.load.continue58: -; CHECK-INTERLEAVED-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-INTERLEAVED: pred.load.if59: -; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-INTERLEAVED: pred.load.continue60: -; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVED: pred.load.if61: ; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVED: pred.load.continue62: -; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-INTERLEAVED: pred.load.continue30: +; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) @@ -1669,313 +1509,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-MAXBW: pred.load.if: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-MAXBW: pred.load.continue: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-MAXBW: pred.load.if1: +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-MAXBW: pred.load.continue2: ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-MAXBW: pred.load.if3: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-MAXBW: pred.load.continue4: ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-MAXBW: pred.load.if5: +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-MAXBW: pred.load.continue6: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-MAXBW: pred.load.if7: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-MAXBW: pred.load.continue8: ; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-MAXBW: pred.load.if9: +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-MAXBW: pred.load.continue10: ; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-MAXBW: pred.load.if11: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-MAXBW: pred.load.continue12: ; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-MAXBW: pred.load.if13: +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-MAXBW: pred.load.continue14: ; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-MAXBW: pred.load.if15: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK-MAXBW: pred.load.continue16: ; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK-MAXBW: pred.load.if17: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK-MAXBW: pred.load.continue18: ; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK-MAXBW: pred.load.if19: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK-MAXBW: pred.load.continue20: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK-MAXBW: pred.load.if21: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK-MAXBW: pred.load.continue22: ; CHECK-MAXBW-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK-MAXBW: pred.load.if23: +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK-MAXBW: pred.load.continue24: ; CHECK-MAXBW-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK-MAXBW: pred.load.if25: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK-MAXBW: pred.load.continue26: ; CHECK-MAXBW-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK-MAXBW: pred.load.if27: +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK-MAXBW: pred.load.continue28: ; CHECK-MAXBW-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK-MAXBW: pred.load.if29: +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK-MAXBW: pred.load.continue30: -; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-MAXBW: pred.load.if31: -; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-MAXBW: pred.load.continue32: -; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-MAXBW: pred.load.if33: -; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-MAXBW: pred.load.continue34: -; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-MAXBW: pred.load.if35: -; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-MAXBW: pred.load.continue36: -; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-MAXBW: pred.load.if37: -; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-MAXBW: pred.load.continue38: -; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-MAXBW: pred.load.if39: -; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-MAXBW: pred.load.continue40: -; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-MAXBW: pred.load.if41: -; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-MAXBW: pred.load.continue42: -; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-MAXBW: pred.load.if43: -; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-MAXBW: pred.load.continue44: -; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-MAXBW: pred.load.if45: -; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-MAXBW: pred.load.continue46: -; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-MAXBW: pred.load.if47: -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-MAXBW: pred.load.continue48: -; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-MAXBW: pred.load.if49: -; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-MAXBW: pred.load.continue50: -; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-MAXBW: pred.load.if51: -; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-MAXBW: pred.load.continue52: -; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-MAXBW: pred.load.if53: -; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-MAXBW: pred.load.continue54: -; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-MAXBW: pred.load.if55: -; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-MAXBW: pred.load.continue56: -; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-MAXBW: pred.load.if57: -; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-MAXBW: pred.load.continue58: -; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-MAXBW: pred.load.if59: -; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-MAXBW: pred.load.continue60: -; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.if61: ; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.continue62: -; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-MAXBW: pred.load.continue30: +; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index d77ca9875bf01..b6147e194426b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1279,13 +1279,13 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -1307,23 +1307,23 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVE1: exit: -; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP22]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP16]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -1345,23 +1345,23 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVED: exit: -; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP16]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -1383,10 +1383,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true ; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] ; CHECK-MAXBW: exit: -; CHECK-MAXBW-NEXT: ret i32 [[TMP21]] +; CHECK-MAXBW-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 62e248bed85d9..a9e23113a43ab 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -89,9 +89,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> +; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = ir<%accum> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 59a9ea1a720b3..a747903dc5a9d 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1220,7 +1220,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + CondOp, VecOp, false, 1); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1235,7 +1235,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + CondOp, VecOp, false, 1); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 4)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); @@ -1593,7 +1593,7 @@ TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + CondOp, VecOp, false, 1); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1608,7 +1608,7 @@ TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + CondOp, VecOp, false, 1); VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_TRUE(isa(&EVLRecipe)); From 0fe3c786328c8925eb206d97651c6ab65afbcaf2 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 4 Nov 2025 16:47:31 +0000 Subject: [PATCH 2/2] Rebase --- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 -- .../AArch64/partial-reduce-dot-product.ll | 18 +++++++++--------- .../LoopVectorize/AArch64/vplan-printing.ll | 8 ++++---- .../Transforms/Vectorize/VPlanTest.cpp | 8 ++++---- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 50f9578196594..6043f41e32a80 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3592,8 +3592,6 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { - bool IsPartialReduction = Red->isPartialReduction(); - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index b6147e194426b..dfc54d658d93a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1301,13 +1301,13 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVE1: exit: ; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP16]] @@ -1339,13 +1339,13 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVED: exit: ; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP16]] @@ -1370,15 +1370,15 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP18]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true ; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index a9e23113a43ab..ddcd2dfb850a5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -80,8 +80,8 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%1>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<%1>, ir<[[RDX_NEXT:%.+]]> (VF scaled by 1/4) ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> @@ -89,13 +89,13 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = ir<%accum> + reduce.add (ir<%mul>) +; CHECK-NEXT: PARTIAL-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul>) ; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> ; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%3> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: EMIT vp<%3> = compute-reduction-result ir<[[RDX]]>, ir<[[RDX_NEXT]]> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index a747903dc5a9d..4caba92bd7233 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1220,7 +1220,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false, 1); + CondOp, VecOp, RdxNormal{}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1235,7 +1235,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false, 1); + CondOp, VecOp, RdxNormal{}); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 4)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); @@ -1593,7 +1593,7 @@ TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false, 1); + CondOp, VecOp, RdxNormal{}); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1608,7 +1608,7 @@ TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false, 1); + CondOp, VecOp, RdxNormal{}); VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_TRUE(isa(&EVLRecipe));