Skip to content

Commit dc6be02

Browse files
committed
[VPlan] Run narrowInterleaveGroups during general VPlan optimizations.
Move narrowInterleaveGroups to the general VPlan optimization stage. To do so, narrowInterleaveGroups now has to find a suitable VF where all interleave groups are consecutive and saturate the full vector width. If such a VF is found, the original VPlan is split into 2: a) a new clone which contains all VFs of Plan, except VFToOptimize, and b) the original Plan with VFToOptimize as single VF. The original Plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans. Together with #149702, this allows taking the narrowed interleave groups into account when interleaving.
1 parent 80f4183 commit dc6be02

11 files changed

+245
-169
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7223,9 +7223,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72237223
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
72247224
VPlanTransforms::simplifyRecipes(BestVPlan);
72257225
VPlanTransforms::removeBranchOnConst(BestVPlan);
7226-
VPlanTransforms::narrowInterleaveGroups(
7227-
BestVPlan, BestVF,
7228-
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
72297226
VPlanTransforms::removeDeadRecipes(BestVPlan);
72307227

72317228
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8336,6 +8333,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
83368333
if (CM.foldTailWithEVL() && !HasScalarVF)
83378334
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
83388335
*Plan, CM.getMaxSafeElements());
8336+
8337+
if (auto P = VPlanTransforms::narrowInterleaveGroups(
8338+
*Plan,
8339+
TTI.getRegisterBitWidth(
8340+
TargetTransformInfo::RGK_FixedWidthVector),
8341+
SubRange))
8342+
VPlans.push_back(std::move(P));
8343+
83398344
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
83408345
VPlans.push_back(std::move(Plan));
83418346
}

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,7 @@ VPlan *VPlan::duplicate() {
12011201
}
12021202
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
12031203
Old2NewVPValues[&VF] = &NewPlan->VF;
1204+
Old2NewVPValues[&UF] = &NewPlan->UF;
12041205
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
12051206
if (BackedgeTakenCount) {
12061207
NewPlan->BackedgeTakenCount = new VPValue();

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3970,6 +3970,9 @@ class VPlan {
39703970
/// Represents the vectorization factor of the loop.
39713971
VPValue VF;
39723972

3973+
/// Represents the symbolic unroll factor of the loop.
3974+
VPValue UF;
3975+
39733976
/// Represents the loop-invariant VF * UF of the vector loop region.
39743977
VPValue VFxUF;
39753978

@@ -4121,6 +4124,9 @@ class VPlan {
41214124
/// Returns the VF of the vector loop region.
41224125
VPValue &getVF() { return VF; };
41234126

4127+
/// Returns the symbolic UF of the vector loop region.
4128+
VPValue &getSymbolicUF() { return UF; };
4129+
41244130
/// Returns VF * UF of the vector loop region.
41254131
VPValue &getVFxUF() { return VFxUF; }
41264132

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3484,6 +3484,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
34843484
// used.
34853485
// TODO: Assert that they aren't used.
34863486

3487+
VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3488+
Plan.getSymbolicUF().replaceAllUsesWith(UF);
3489+
34873490
// If there are no users of the runtime VF, compute VFxUF by constant folding
34883491
// the multiplication of VF and UF.
34893492
if (VF.getNumUsers() == 0) {
@@ -3503,7 +3506,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
35033506
}
35043507
VF.replaceAllUsesWith(RuntimeVF);
35053508

3506-
VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
35073509
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
35083510
VFxUF.replaceAllUsesWith(MulByUF);
35093511
}
@@ -3612,16 +3614,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
36123614
return RepR && RepR->isSingleScalar();
36133615
}
36143616

3615-
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3616-
unsigned VectorRegWidth) {
3617+
std::unique_ptr<VPlan>
3618+
VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
3619+
VFRange &Range) {
3620+
using namespace llvm::VPlanPatternMatch;
36173621
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3622+
36183623
if (!VectorLoop)
3619-
return;
3624+
return nullptr;
36203625

36213626
VPTypeAnalysis TypeInfo(Plan);
3622-
3623-
unsigned VFMinVal = VF.getKnownMinValue();
36243627
SmallVector<VPInterleaveRecipe *> StoreGroups;
3628+
std::optional<ElementCount> VFToOptimize;
36253629
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
36263630
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
36273631
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
@@ -3636,30 +3640,38 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
36363640
// * recipes writing to memory except interleave groups
36373641
// Only support plans with a canonical induction phi.
36383642
if (R.isPhi())
3639-
return;
3643+
return nullptr;
36403644

36413645
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
36423646
if (R.mayWriteToMemory() && !InterleaveR)
3643-
return;
3644-
3645-
// Do not narrow interleave groups if there are VectorPointer recipes and
3646-
// the plan was unrolled. The recipe implicitly uses VF from
3647-
// VPTransformState.
3648-
// TODO: Remove restriction once the VF for the VectorPointer offset is
3649-
// modeled explicitly as operand.
3650-
if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
3651-
return;
3647+
return nullptr;
36523648

36533649
// All other ops are allowed, but we reject uses that cannot be converted
36543650
// when checking all allowed consumers (store interleave groups) below.
36553651
if (!InterleaveR)
36563652
continue;
36573653

3658-
// Bail out on non-consecutive interleave groups.
3659-
if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
3660-
VectorRegWidth))
3661-
return;
3662-
3654+
// Try to find a single VF, where all interleave groups are consecutive and
3655+
// saturate the full vector width. If we already have a candidate VF, check
3656+
// if it is applicable for the current InterleaveR, otherwise look for a
3657+
// suitable VF across the Plan's VFs.
3658+
//
3659+
if (VFToOptimize) {
3660+
if (!isConsecutiveInterleaveGroup(InterleaveR,
3661+
VFToOptimize->getKnownMinValue(),
3662+
TypeInfo, VectorRegWidth))
3663+
return nullptr;
3664+
} else {
3665+
for (ElementCount VF : Plan.vectorFactors()) {
3666+
if (isConsecutiveInterleaveGroup(InterleaveR, VF.getKnownMinValue(),
3667+
TypeInfo, VectorRegWidth)) {
3668+
VFToOptimize = VF;
3669+
break;
3670+
}
3671+
}
3672+
if (!VFToOptimize)
3673+
return nullptr;
3674+
}
36633675
// Skip read interleave groups.
36643676
if (InterleaveR->getStoredValues().empty())
36653677
continue;
@@ -3693,24 +3705,44 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
36933705
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
36943706
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
36953707
if (!WideMember0)
3696-
return;
3708+
return nullptr;
36973709
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
36983710
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
36993711
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
37003712
R->getNumOperands() > 2)
3701-
return;
3713+
return nullptr;
37023714
if (any_of(enumerate(R->operands()),
37033715
[WideMember0, Idx = I](const auto &P) {
37043716
const auto &[OpIdx, OpV] = P;
37053717
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
37063718
}))
3707-
return;
3719+
return nullptr;
37083720
}
37093721
StoreGroups.push_back(InterleaveR);
37103722
}
37113723

37123724
if (StoreGroups.empty())
3713-
return;
3725+
return nullptr;
3726+
3727+
// All interleave groups in Plan can be narrowed for VFToOptimize. Split the
3728+
// original Plan into 2: a) a new clone which contains all VFs of Plan, except
3729+
// VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
3730+
std::unique_ptr<VPlan> NewPlan;
3731+
if (size(Plan.vectorFactors()) != 1) {
3732+
NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
3733+
Plan.setVF(*VFToOptimize);
3734+
bool First = true;
3735+
for (ElementCount VF : NewPlan->vectorFactors()) {
3736+
if (VF == VFToOptimize)
3737+
continue;
3738+
if (First) {
3739+
NewPlan->setVF(VF);
3740+
First = false;
3741+
continue;
3742+
}
3743+
NewPlan->addVF(VF);
3744+
}
3745+
}
37143746

37153747
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
37163748
auto NarrowOp = [](VPValue *V) -> VPValue * {
@@ -3777,9 +3809,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
37773809
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
37783810
VPBuilder PHBuilder(Plan.getVectorPreheader());
37793811

3780-
VPValue *UF = Plan.getOrAddLiveIn(
3781-
ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
3782-
if (VF.isScalable()) {
3812+
VPValue *UF = &Plan.getSymbolicUF();
3813+
if (VFToOptimize->isScalable()) {
37833814
VPValue *VScale = PHBuilder.createElementCount(
37843815
CanIV->getScalarType(), ElementCount::getScalable(1));
37853816
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -3791,6 +3822,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
37913822
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
37923823
}
37933824
removeDeadRecipes(Plan);
3825+
assert(none_of(*VectorLoop->getEntryBasicBlock(),
3826+
IsaPred<VPVectorPointerRecipe>) &&
3827+
"All VPVectorPointerRecipes should have been removed");
3828+
return NewPlan;
37943829
}
37953830

37963831
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,14 +324,19 @@ struct VPlanTransforms {
324324
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
325325
ScalarEvolution &SE);
326326

327-
/// Try to convert a plan with interleave groups with VF elements to a plan
328-
/// with the interleave groups replaced by wide loads and stores processing VF
329-
/// elements, if all transformed interleave groups access the full vector
330-
/// width (checked via \o VectorRegWidth). This effectively is a very simple
331-
/// form of loop-aware SLP, where we use interleave groups to identify
332-
/// candidates.
333-
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
334-
unsigned VectorRegWidth);
327+
/// Try to find a single VF among \p Plan's VFs for which all interleave
328+
/// groups (with VF elements) can be replaced by wide loads and stores
329+
/// processing VF elements, if all transformed interleave groups access the
330+
/// full vector width (checked via \p VectorRegWidth). If the transformation
331+
/// can be applied, the original \p Plan will be split in 2, if it has
332+
/// multiple VFs: a) a new clone which contains all VFs of Plan, except
333+
/// VFToOptimize, and b) the original Plan with VFToOptimize as single VF. In
334+
/// that case, the new clone is returned.
335+
///
336+
/// This effectively is a very simple form of loop-aware SLP, where we use
337+
/// interleave groups to identify candidates.
338+
static std::unique_ptr<VPlan>
339+
narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth, VFRange &Range);
335340

336341
/// Predicate and linearize the control-flow in the only loop region of
337342
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -175,28 +175,18 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
175175
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
176176
; CHECK: [[VECTOR_BODY]]:
177177
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
178-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
178+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
179179
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
180180
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
181-
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
182-
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
183-
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
184-
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
185-
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
186-
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
187-
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
188-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
181+
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
182+
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
189183
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
190184
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
191185
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
192186
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
193-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
194-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
195-
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
196-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
197-
; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
198-
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
199-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
187+
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
188+
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
189+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
200190
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
201191
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
202192
; CHECK: [[MIDDLE_BLOCK]]:
@@ -237,28 +227,18 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
237227
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
238228
; CHECK: [[VECTOR_BODY]]:
239229
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
240-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
230+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
241231
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
242232
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
243-
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
244-
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
245-
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
246-
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
247-
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
248-
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
249-
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
250-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
233+
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
234+
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
251235
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
252236
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
253237
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
254238
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
255-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
256-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
257-
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
258-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
259-
; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
260-
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
261-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
239+
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
240+
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
241+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
262242
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
263243
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
264244
; CHECK: [[MIDDLE_BLOCK]]:

0 commit comments

Comments
 (0)