7 changes: 4 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7191,9 +7191,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::removeDeadRecipes(BestVPlan);

@@ -8240,6 +8237,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL() && !HasScalarVF)
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());

if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
VPlans.push_back(std::move(P));

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1198,6 +1198,7 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
12 changes: 12 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4081,6 +4081,9 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;

/// Represents the symbolic unroll factor of the loop.
VPValue UF;

/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;

@@ -4232,6 +4235,9 @@ class VPlan {
/// Returns the VF of the vector loop region.
VPValue &getVF() { return VF; };

/// Returns the symbolic UF of the vector loop region.
VPValue &getSymbolicUF() { return UF; };
Member: const

Contributor Author: Unfortunately this can't be made const, as it is used with replaceAllUsesWith, which cannot be const.
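For illustration, a minimal compilable sketch of the constraint discussed above, using toy stand-ins rather than the real VPValue/VPlan classes: replaceAllUsesWith has to mutate use lists, so the accessor must hand out a non-const reference.

#include <vector>

// Toy stand-ins for VPValue/VPlan; not the real LLVM classes.
struct ToyValue {
  std::vector<ToyValue **> Uses; // locations currently pointing at this value

  // Redirects every recorded use to point at New. This mutates both use
  // lists, so neither the method nor the object it runs on can be const.
  void replaceAllUsesWith(ToyValue &New) {
    for (ToyValue **U : Uses)
      *U = &New;
    New.Uses.insert(New.Uses.end(), Uses.begin(), Uses.end());
    Uses.clear();
  }
};

struct ToyPlan {
  ToyValue UF;
  // A const accessor would forbid getSymbolicUF().replaceAllUsesWith(...),
  // which is exactly how materializeVFAndVFxUF uses it in this patch.
  ToyValue &getSymbolicUF() { return UF; }
};

int main() {
  ToyPlan Plan;
  ToyValue MaterializedUF;
  Plan.getSymbolicUF().replaceAllUsesWith(MaterializedUF); // needs mutable ref
  return 0;
}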


/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }

@@ -4241,6 +4247,12 @@ class VPlan {

void addVF(ElementCount VF) { VFs.insert(VF); }

/// Remove \p VF from the plan.
void removeVF(ElementCount VF) {
assert(hasVF(VF) && "tried to remove VF not present in plan");
VFs.remove(VF);
}

void setVF(ElementCount VF) {
assert(hasVF(VF) && "Cannot set VF not already in plan");
VFs.clear();
117 changes: 76 additions & 41 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3807,6 +3807,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
// used.
// TODO: Assert that they aren't used.

VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
Plan.getSymbolicUF().replaceAllUsesWith(UF);

// If there are no users of the runtime VF, compute VFxUF by constant folding
// the multiplication of VF and UF.
if (VF.getNumUsers() == 0) {
@@ -3826,7 +3829,6 @@ }
}
VF.replaceAllUsesWith(RuntimeVF);

VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -3894,14 +3896,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}

/// Returns true if \p IR is a full interleave group with factor and number of
/// members both equal to \p VF. The interleave group must also access the full
/// vector width \p VectorRegWidth.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
unsigned VF, VPTypeAnalysis &TypeInfo,
unsigned VectorRegWidth) {
/// Returns a VF from \p VFs if \p InterleaveR is a full interleave group with
/// factor and number of members both equal to that VF. The interleave group
/// must also access the full vector register width for that VF, as reported
/// by \p TTI.
static std::optional<ElementCount> isConsecutiveInterleaveGroup(
VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
if (!InterleaveR)
return false;
return std::nullopt;

Type *GroupElementTy = nullptr;
if (InterleaveR->getStoredValues().empty()) {
@@ -3910,21 +3912,35 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
return false;
return std::nullopt;
} else {
GroupElementTy =
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
if (!all_of(InterleaveR->getStoredValues(),
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
return false;
return std::nullopt;
}

unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
auto IG = InterleaveR->getInterleaveGroup();
return IG->getFactor() == VF && IG->getNumMembers() == VF &&
GroupSize == VectorRegWidth;
auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
TypeSize Size = TTI.getRegisterBitWidth(
VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
: TargetTransformInfo::RGK_ScalableVector);
assert(Size.isScalable() == VF.isScalable() &&
"if Size is scalable, VF must to and vice versa");
return Size.getKnownMinValue();
};

for (ElementCount VF : VFs) {
unsigned MinVal = VF.getKnownMinValue();
unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
auto IG = InterleaveR->getInterleaveGroup();
if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
GroupSize == GetVectorWidthForVF(VF))
return {VF};
}
return std::nullopt;
}
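As a worked example of the width check implemented above (hypothetical element size, register width, and candidate VFs; not part of the patch): the group saturates the register only when the scalar size times the VF equals the register width, so with 64-bit elements and a 128-bit fixed-width register only VF = 2 qualifies.

#include <cstdio>

int main() {
  const unsigned ElementBits = 64;   // e.g. a group of double elements
  const unsigned RegisterBits = 128; // e.g. RGK_FixedWidthVector on a 128-bit target
  const unsigned CandidateVFs[] = {2, 4};
  for (unsigned VF : CandidateVFs) {
    unsigned GroupSizeBits = ElementBits * VF; // scalar size * known-min VF
    std::printf("VF=%u: group covers %u bits -> %s\n", VF, GroupSizeBits,
                GroupSizeBits == RegisterBits ? "narrowing candidate" : "rejected");
  }
  return 0;
}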

/// Returns true if \p VPValue is a narrow VPValue.
@@ -3935,16 +3951,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}

void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
std::unique_ptr<VPlan>
VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
const TargetTransformInfo &TTI) {
using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

if (!VectorLoop)
return;
return nullptr;

VPTypeAnalysis TypeInfo(Plan);

unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
@@ -3959,30 +3977,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
return;
return nullptr;

auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
return;

// Do not narrow interleave groups if there are VectorPointer recipes and
// the plan was unrolled. The recipe implicitly uses VF from
// VPTransformState.
// TODO: Remove restriction once the VF for the VectorPointer offset is
// modeled explicitly as operand.
if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
return;
return nullptr;

// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;

// Bail out on non-consecutive interleave groups.
if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
VectorRegWidth))
return;

// Try to find a single VF where all interleave groups are consecutive and
// saturate the full vector width. If we already have a candidate VF, check
// whether it also applies to the current InterleaveR; otherwise look for a
// suitable VF across the Plan's VFs.
if (VFToOptimize) {
if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
TTI))
return nullptr;
} else {
if (auto VF = isConsecutiveInterleaveGroup(
InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI)) {
VFToOptimize = *VF;
}
if (!VFToOptimize)
return nullptr;
}
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -4016,24 +4038,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
return;
return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
return;
return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
return;
return nullptr;
}
StoreGroups.push_back(InterleaveR);
}

if (StoreGroups.empty())
return;
return nullptr;

// All interleave groups in Plan can be narrowed for VFToOptimize. Split the
// original Plan into two: a) a new clone that contains all of Plan's VFs
// except VFToOptimize, and b) the original Plan with VFToOptimize as its
// single VF.
std::unique_ptr<VPlan> NewPlan;
if (size(Plan.vectorFactors()) != 1) {
NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
Plan.setVF(*VFToOptimize);
NewPlan->removeVF(*VFToOptimize);
}

// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4104,9 +4136,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());

VPValue *UF = Plan.getOrAddLiveIn(
ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
if (VF.isScalable()) {
VPValue *UF = &Plan.getSymbolicUF();
if (VFToOptimize->isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
CanIV->getScalarType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
Expand All @@ -4118,6 +4149,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
}
removeDeadRecipes(Plan);
assert(none_of(*VectorLoop->getEntryBasicBlock(),
IsaPred<VPVectorPointerRecipe>) &&
"All VPVectorPointerRecipes should have been removed");
return NewPlan;
}

/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
22 changes: 14 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -333,14 +333,20 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);

/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
/// width (checked via \o VectorRegWidth). This effectively is a very simple
/// form of loop-aware SLP, where we use interleave groups to identify
/// candidates.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth);
/// Try to find a single VF among \p Plan's VFs for which all interleave
/// groups (with a known minimum of VF elements) can be replaced by wide loads
/// and stores processing VF elements, provided all transformed interleave
/// groups access the full vector register width, as reported by TTI. If the
/// transformation can be applied, the original \p Plan is split into two:
/// 1. The original Plan, restricted to the single optimized VF, with the
/// interleave groups replaced by wide loads and stores.
/// 2. A new clone which contains all VFs of Plan except the optimized VF and
/// is returned to the caller (or nullptr if there are no other VFs).
///
/// This effectively is a very simple form of loop-aware SLP, where we use
/// interleave groups to identify candidates.
static std::unique_ptr<VPlan>
narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);

/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
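To make the split contract described above concrete, here is a small self-contained sketch using toy types only (plain C++, not the LLVM API): the original plan keeps just the optimized VF, while the clone, returned only when other VFs exist, carries the rest.

#include <cstdio>
#include <memory>
#include <vector>

// Toy plan: just a set of candidate VFs, standing in for a VPlan.
struct ToyPlan {
  std::vector<unsigned> VFs;
};

// Models the contract of narrowInterleaveGroups: restrict Plan to OptVF and
// hand back a clone with the remaining VFs, or nullptr if there are none.
std::unique_ptr<ToyPlan> splitForVF(ToyPlan &Plan, unsigned OptVF) {
  std::unique_ptr<ToyPlan> Rest;
  if (Plan.VFs.size() > 1) {
    Rest = std::make_unique<ToyPlan>(Plan); // clone keeps all VFs ...
    std::erase(Rest->VFs, OptVF);           // ... except the optimized one
  }
  Plan.VFs = {OptVF}; // original plan is narrowed to the single optimized VF
  return Rest;
}

int main() {
  ToyPlan Plan{{2, 4, 8}};
  std::unique_ptr<ToyPlan> Rest = splitForVF(Plan, 4);
  std::printf("original keeps VF=%u, clone keeps %zu other VFs\n", Plan.VFs[0],
              Rest ? Rest->VFs.size() : 0);
  return 0;
}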
@@ -175,28 +175,18 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -237,28 +227,18 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]: