Skip to content

Commit 851f8f7

Browse files
authored
[VPlan] Disable partial reductions again with EVL tail folding (#167863)
VPPartialReductionRecipe doesn't yet support an EVL variant, and we guard against this by not calling convertToAbstractRecipes when we're tail folding with EVL. However, recently some things got shuffled around, which means we may detect some scaled reductions in collectScaledReductions and store them in ScaledReductionMap; outside of convertToAbstractRecipes we may then look them up and start e.g. adding a scale factor to an otherwise regular VPReductionPHI. This patch fixes that by also skipping collectScaledReductions when tail folding with EVL. Fixes #167861.
1 parent 675bf80 commit 851f8f7

File tree

2 files changed

+128
-1
lines changed

2 files changed

+128
-1
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8395,7 +8395,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
83958395
// ---------------------------------------------------------------------------
83968396
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
83978397
Builder, BlockMaskCache, LVer);
8398-
RecipeBuilder.collectScaledReductions(Range);
8398+
// TODO: Handle partial reductions with EVL tail folding.
8399+
if (!CM.foldTailWithEVL())
8400+
RecipeBuilder.collectScaledReductions(Range);
83998401

84008402
// Scan the body of the loop in a topological order to visit each basic block
84018403
// after having visited its predecessor basic blocks.

llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ
44
; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V
55
; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ
6+
; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,TAILFOLD
67

78
; TODO: Remove -prefer-predicate-over-epilogue=scalar-epilogue when partial reductions with EVL tail folding is supported.
89

@@ -147,6 +148,37 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
147148
; FIXED-ZVQDOTQ: for.exit:
148149
; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]]
149150
;
151+
; TAILFOLD-LABEL: define i32 @vqdot(
152+
; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
153+
; TAILFOLD-NEXT: entry:
154+
; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]]
155+
; TAILFOLD: vector.ph:
156+
; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]]
157+
; TAILFOLD: vector.body:
158+
; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
159+
; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
160+
; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
161+
; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
162+
; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
163+
; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
164+
; TAILFOLD-NEXT: [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
165+
; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
166+
; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
167+
; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
168+
; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
169+
; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
170+
; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
171+
; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
172+
; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
173+
; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
174+
; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
175+
; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
176+
; TAILFOLD: middle.block:
177+
; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
178+
; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]]
179+
; TAILFOLD: for.exit:
180+
; TAILFOLD-NEXT: ret i32 [[TMP10]]
181+
;
150182
entry:
151183
br label %for.body
152184

@@ -309,6 +341,37 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
309341
; FIXED-ZVQDOTQ: for.exit:
310342
; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]]
311343
;
344+
; TAILFOLD-LABEL: define i32 @vqdotu(
345+
; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
346+
; TAILFOLD-NEXT: entry:
347+
; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]]
348+
; TAILFOLD: vector.ph:
349+
; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]]
350+
; TAILFOLD: vector.body:
351+
; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
352+
; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
353+
; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
354+
; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
355+
; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
356+
; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
357+
; TAILFOLD-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
358+
; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
359+
; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
360+
; TAILFOLD-NEXT: [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
361+
; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
362+
; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
363+
; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
364+
; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
365+
; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
366+
; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
367+
; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
368+
; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
369+
; TAILFOLD: middle.block:
370+
; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
371+
; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]]
372+
; TAILFOLD: for.exit:
373+
; TAILFOLD-NEXT: ret i32 [[TMP10]]
374+
;
312375
entry:
313376
br label %for.body
314377

@@ -471,6 +534,37 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
471534
; FIXED-ZVQDOTQ: for.exit:
472535
; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]]
473536
;
537+
; TAILFOLD-LABEL: define i32 @vqdotsu(
538+
; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
539+
; TAILFOLD-NEXT: entry:
540+
; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]]
541+
; TAILFOLD: vector.ph:
542+
; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]]
543+
; TAILFOLD: vector.body:
544+
; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
545+
; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
546+
; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
547+
; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
548+
; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
549+
; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
550+
; TAILFOLD-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
551+
; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
552+
; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
553+
; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
554+
; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
555+
; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
556+
; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
557+
; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
558+
; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
559+
; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
560+
; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
561+
; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
562+
; TAILFOLD: middle.block:
563+
; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
564+
; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]]
565+
; TAILFOLD: for.exit:
566+
; TAILFOLD-NEXT: ret i32 [[TMP10]]
567+
;
474568
entry:
475569
br label %for.body
476570

@@ -632,6 +726,37 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
632726
; FIXED-ZVQDOTQ: for.exit:
633727
; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]]
634728
;
729+
; TAILFOLD-LABEL: define i32 @vqdotsu2(
730+
; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
731+
; TAILFOLD-NEXT: entry:
732+
; TAILFOLD-NEXT: br label [[VECTOR_PH:%.*]]
733+
; TAILFOLD: vector.ph:
734+
; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]]
735+
; TAILFOLD: vector.body:
736+
; TAILFOLD-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
737+
; TAILFOLD-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
738+
; TAILFOLD-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
739+
; TAILFOLD-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
740+
; TAILFOLD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
741+
; TAILFOLD-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
742+
; TAILFOLD-NEXT: [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
743+
; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
744+
; TAILFOLD-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
745+
; TAILFOLD-NEXT: [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
746+
; TAILFOLD-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
747+
; TAILFOLD-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
748+
; TAILFOLD-NEXT: [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
749+
; TAILFOLD-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
750+
; TAILFOLD-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
751+
; TAILFOLD-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
752+
; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
753+
; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
754+
; TAILFOLD: middle.block:
755+
; TAILFOLD-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
756+
; TAILFOLD-NEXT: br label [[FOR_EXIT:%.*]]
757+
; TAILFOLD: for.exit:
758+
; TAILFOLD-NEXT: ret i32 [[TMP10]]
759+
;
635760
entry:
636761
br label %for.body
637762

0 commit comments

Comments
 (0)