Skip to content

Commit ae7b15f

Browse files
committed
[VPlan] Return invalid for scalable VF in VPReplicateRecipe::computeCost
Replication is currently not supported for scalable VFs. Make sure VPReplicateRecipe::computeCost returns an invalid cost early for scalable VFs if the recipe is not a single-scalar. Note that this moves the existing invalid-costs.ll out of the AArch64 subdirectory, as it does not use a target triple. Fixes llvm#160792.
1 parent ca55c07 commit ae7b15f

File tree

4 files changed

+119
-38
lines changed

4 files changed

+119
-38
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
17771777
if (VF.isScalar())
17781778
return 0;
17791779

1780+
assert(!VF.isScalable() &&
1781+
"Scalarization overhead not supported for scalable vectors");
1782+
17801783
InstructionCost ScalarizationCost = 0;
17811784
// Compute the cost of scalarizing the result if needed.
17821785
if (!ResultTy->isVoidTy()) {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3174,6 +3174,9 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
31743174
// transform, avoid computing their cost multiple times for now.
31753175
Ctx.SkipCostComputation.insert(UI);
31763176

3177+
if (VF.isScalable() && !isSingleScalar())
3178+
return InstructionCost::getInvalid();
3179+
31773180
switch (UI->getOpcode()) {
31783181
case Instruction::GetElementPtr:
31793182
// We mark this instruction as zero-cost because the cost of GEPs in
@@ -3221,9 +3224,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
32213224
return ScalarCallCost;
32223225
}
32233226

3224-
if (VF.isScalable())
3225-
return InstructionCost::getInvalid();
3226-
32273227
return ScalarCallCost * VF.getFixedValue() +
32283228
Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
32293229
}
@@ -3274,9 +3274,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
32743274
}
32753275
case Instruction::Load:
32763276
case Instruction::Store: {
3277-
if (VF.isScalable() && !isSingleScalar())
3278-
return InstructionCost::getInvalid();
3279-
32803277
// TODO: See getMemInstScalarizationCost for how to handle replicating and
32813278
// predicated cases.
32823279
const VPRegionBlock *ParentRegion = getParent()->getParent();
Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,81 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
22
; RUN: opt -passes="loop-vectorize" -pass-remarks-output=%t.yaml -S %s | FileCheck %s
33
; RUN: FileCheck --input-file=%t.yaml --check-prefix=REMARKS %s
44

5-
; REMARKS: the cost-model indicates that vectorization is not beneficial
5+
target triple = "arm64-apple-macosx"
66

7-
; Test for https://github.com/llvm/llvm-project/issues/116375.
8-
define void @test_i24_load_for(ptr noalias %src, ptr %dst) {
9-
; CHECK-LABEL: define void @test_i24_load_for(
10-
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
11-
; CHECK-NEXT: [[ENTRY:.*]]:
12-
; CHECK-NEXT: br label %[[LOOP:.*]]
13-
; CHECK: [[LOOP]]:
14-
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
15-
; CHECK-NEXT: [[FOR:%.*]] = phi i24 [ 0, %[[ENTRY]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
16-
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
17-
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i24, ptr [[SRC]], i16 [[IV]]
18-
; CHECK-NEXT: [[FOR_NEXT]] = load i24, ptr [[GEP_SRC]], align 1
19-
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i24, ptr [[DST]], i16 [[IV]]
20-
; CHECK-NEXT: store i24 [[FOR]], ptr [[GEP_DST]], align 4
21-
; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
22-
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
23-
; CHECK: [[EXIT]]:
24-
; CHECK-NEXT: ret void
7+
; REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
8+
; Test case for https://github.com/llvm/llvm-project/issues/160792.
9+
define void @replicate_sdiv_conditional(ptr noalias %a, ptr noalias %b, ptr noalias %c) #0 {
10+
; CHECK-LABEL: define void @replicate_sdiv_conditional(
11+
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
12+
; CHECK-NEXT: [[ENTRY:.*:]]
13+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
14+
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
15+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
16+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
17+
; CHECK: [[VECTOR_PH]]:
18+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
19+
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
20+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
21+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
22+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
23+
; CHECK: [[VECTOR_BODY]]:
24+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
25+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
26+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4
27+
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
28+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
29+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison)
30+
; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i32> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i64>
31+
; CHECK-NEXT: [[TMP8:%.*]] = ashr <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
32+
; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_LOAD]]
33+
; CHECK-NEXT: [[TMP10:%.*]] = sext <vscale x 4 x i32> [[TMP9]] to <vscale x 4 x i64>
34+
; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i64> [[TMP7]], <vscale x 4 x i64> splat (i64 1)
35+
; CHECK-NEXT: [[TMP12:%.*]] = sdiv <vscale x 4 x i64> [[TMP10]], [[TMP11]]
36+
; CHECK-NEXT: [[TMP13:%.*]] = trunc <vscale x 4 x i64> [[TMP12]] to <vscale x 4 x i32>
37+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[WIDE_LOAD]]
38+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
39+
; CHECK-NEXT: store <vscale x 4 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
40+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
41+
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
42+
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
43+
; CHECK: [[MIDDLE_BLOCK]]:
44+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
45+
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
46+
; CHECK: [[SCALAR_PH]]:
2547
;
2648
entry:
27-
br label %loop
49+
br label %loop.header
2850

29-
loop:
30-
%iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
31-
%for = phi i24 [ 0, %entry ], [ %for.next, %loop ]
32-
%iv.next = add i16 %iv, 1
33-
%gep.src = getelementptr inbounds i24, ptr %src, i16 %iv
34-
%for.next = load i24, ptr %gep.src, align 1
35-
%gep.dst = getelementptr inbounds i24, ptr %dst, i16 %iv
36-
store i24 %for, ptr %gep.dst
37-
%ec = icmp eq i16 %iv.next, 1000
38-
br i1 %ec, label %exit, label %loop
51+
loop.header:
52+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
53+
%gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
54+
%val.c = load i32, ptr %gep.c, align 4
55+
%cmp = icmp slt i32 %val.c, 0
56+
br i1 %cmp, label %if.then, label %loop.latch
3957

40-
exit:
58+
if.then:
59+
%gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
60+
%val.b = load i32, ptr %gep.b, align 4
61+
%sext = sext i32 %val.b to i64
62+
%shr = ashr i32 %val.b, 1
63+
%add = add i32 %shr, %val.c
64+
%conv = sext i32 %add to i64
65+
%div = sdiv i64 %conv, %sext
66+
%trunc = trunc i64 %div to i32
67+
br label %loop.latch
68+
69+
loop.latch:
70+
%result = phi i32 [ %trunc, %if.then ], [ %val.c, %loop.header ]
71+
%gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
72+
store i32 %result, ptr %gep.a, align 4
73+
%iv.next = add nuw nsw i64 %iv, 1
74+
%exit = icmp eq i64 %iv.next, 64
75+
br i1 %exit, label %for.end, label %loop.header
76+
77+
for.end:
4178
ret void
4279
}
80+
81+
attributes #0 = { "target-cpu"="neoverse-512tvb" }
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes="loop-vectorize" -pass-remarks-output=%t.yaml -S %s | FileCheck %s
3+
; RUN: FileCheck --input-file=%t.yaml --check-prefix=REMARKS %s
4+
5+
; REMARKS: the cost-model indicates that vectorization is not beneficial
6+
7+
; Test for https://github.com/llvm/llvm-project/issues/116375.
8+
define void @test_i24_load_for(ptr noalias %src, ptr %dst) {
9+
; CHECK-LABEL: define void @test_i24_load_for(
10+
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
11+
; CHECK-NEXT: [[ENTRY:.*]]:
12+
; CHECK-NEXT: br label %[[LOOP:.*]]
13+
; CHECK: [[LOOP]]:
14+
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
15+
; CHECK-NEXT: [[FOR:%.*]] = phi i24 [ 0, %[[ENTRY]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
16+
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
17+
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i24, ptr [[SRC]], i16 [[IV]]
18+
; CHECK-NEXT: [[FOR_NEXT]] = load i24, ptr [[GEP_SRC]], align 1
19+
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i24, ptr [[DST]], i16 [[IV]]
20+
; CHECK-NEXT: store i24 [[FOR]], ptr [[GEP_DST]], align 4
21+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
22+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
23+
; CHECK: [[EXIT]]:
24+
; CHECK-NEXT: ret void
25+
;
26+
entry:
27+
br label %loop
28+
29+
loop:
30+
%iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
31+
%for = phi i24 [ 0, %entry ], [ %for.next, %loop ]
32+
%iv.next = add i16 %iv, 1
33+
%gep.src = getelementptr inbounds i24, ptr %src, i16 %iv
34+
%for.next = load i24, ptr %gep.src, align 1
35+
%gep.dst = getelementptr inbounds i24, ptr %dst, i16 %iv
36+
store i24 %for, ptr %gep.dst
37+
%ec = icmp eq i16 %iv.next, 1000
38+
br i1 %ec, label %exit, label %loop
39+
40+
exit:
41+
ret void
42+
}

0 commit comments

Comments
 (0)