[VPlan] Return invalid for scalable VF in VPReplicateRecipe::computeCost

fhahn · fhahn · commit ae7b15f2e29c · 2025-10-11T19:28:02.000+01:00
Replication is currently not supported for scalable VFs. Make sure VPReplicateRecipe::computeCost returns an invalid cost early, for scalable VFs if the recipe is not a single-scalar. Note that this moves the existing invalid-costs.ll out of the AArch64 subdirectory, as it does not use a target triple. Fixes llvm#160792.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1777,6 +1777,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   if (VF.isScalar())
     return 0;
 
+  assert(!VF.isScalable() &&
+         "Scalarization overhead not supported for scalable vectors");
+
   InstructionCost ScalarizationCost = 0;
   // Compute the cost of scalarizing the result if needed.
   if (!ResultTy->isVoidTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3174,6 +3174,9 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   // transform, avoid computing their cost multiple times for now.
   Ctx.SkipCostComputation.insert(UI);
 
+  if (VF.isScalable() && !isSingleScalar())
+    return InstructionCost::getInvalid();
+
   switch (UI->getOpcode()) {
   case Instruction::GetElementPtr:
     // We mark this instruction as zero-cost because the cost of GEPs in
@@ -3221,9 +3224,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
       return ScalarCallCost;
     }
 
-    if (VF.isScalable())
-      return InstructionCost::getInvalid();
-
     return ScalarCallCost * VF.getFixedValue() +
            Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
   }
@@ -3274,9 +3274,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (VF.isScalable() && !isSingleScalar())
-      return InstructionCost::getInvalid();
-
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
     const VPRegionBlock *ParentRegion = getParent()->getParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll
@@ -1,42 +1,81 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
 ; RUN: opt -passes="loop-vectorize" -pass-remarks-output=%t.yaml -S %s | FileCheck %s
 ; RUN: FileCheck --input-file=%t.yaml --check-prefix=REMARKS %s
 
-; REMARKS: the cost-model indicates that vectorization is not beneficial
+target triple = "arm64-apple-macosx"
 
-; Test for https://github.com/llvm/llvm-project/issues/116375.
-define void @test_i24_load_for(ptr noalias %src, ptr %dst) {
-; CHECK-LABEL: define void @test_i24_load_for(
-; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[FOR:%.*]] = phi i24 [ 0, %[[ENTRY]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i24, ptr [[SRC]], i16 [[IV]]
-; CHECK-NEXT:    [[FOR_NEXT]] = load i24, ptr [[GEP_SRC]], align 1
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i24, ptr [[DST]], i16 [[IV]]
-; CHECK-NEXT:    store i24 [[FOR]], ptr [[GEP_DST]], align 4
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
+; REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; Test case for https://github.com/llvm/llvm-project/issues/160792.
+define void @replicate_sdiv_conditional(ptr noalias %a, ptr noalias %b, ptr noalias %c) #0 {
+; CHECK-LABEL: define void @replicate_sdiv_conditional(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <vscale x 4 x i32> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <vscale x 4 x i32> [[TMP9]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i64> [[TMP7]], <vscale x 4 x i64> splat (i64 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = sdiv <vscale x 4 x i64> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <vscale x 4 x i64> [[TMP12]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
 ;
 entry:
-  br label %loop
+  br label %loop.header
 
-loop:
-  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
-  %for = phi i24 [ 0, %entry ], [ %for.next, %loop ]
-  %iv.next = add i16 %iv, 1
-  %gep.src = getelementptr inbounds i24, ptr %src, i16 %iv
-  %for.next = load i24, ptr %gep.src, align 1
-  %gep.dst = getelementptr inbounds i24, ptr %dst, i16 %iv
-  store i24 %for, ptr %gep.dst
-  %ec = icmp eq i16 %iv.next, 1000
-  br i1 %ec, label %exit, label %loop
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  %val.c = load i32, ptr %gep.c, align 4
+  %cmp = icmp slt i32 %val.c, 0
+  br i1 %cmp, label %if.then, label %loop.latch
 
-exit:
+if.then:
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %val.b = load i32, ptr %gep.b, align 4
+  %sext = sext i32 %val.b to i64
+  %shr = ashr i32 %val.b, 1
+  %add = add i32 %shr, %val.c
+  %conv = sext i32 %add to i64
+  %div = sdiv i64 %conv, %sext
+  %trunc = trunc i64 %div to i32
+  br label %loop.latch
+
+loop.latch:
+  %result = phi i32 [ %trunc, %if.then ], [ %val.c, %loop.header ]
+  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %result, ptr %gep.a, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exit = icmp eq i64 %iv.next, 64
+  br i1 %exit, label %for.end, label %loop.header
+
+for.end:
   ret void
 }
+
+attributes #0 = { "target-cpu"="neoverse-512tvb" }
diff --git a/llvm/test/Transforms/LoopVectorize/invalid-costs.ll b/llvm/test/Transforms/LoopVectorize/invalid-costs.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="loop-vectorize" -pass-remarks-output=%t.yaml -S %s | FileCheck %s
+; RUN: FileCheck --input-file=%t.yaml --check-prefix=REMARKS %s
+
+; REMARKS: the cost-model indicates that vectorization is not beneficial
+
+; Test for https://github.com/llvm/llvm-project/issues/116375.
+define void @test_i24_load_for(ptr noalias %src, ptr %dst) {
+; CHECK-LABEL: define void @test_i24_load_for(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i24 [ 0, %[[ENTRY]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i24, ptr [[SRC]], i16 [[IV]]
+; CHECK-NEXT:    [[FOR_NEXT]] = load i24, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i24, ptr [[DST]], i16 [[IV]]
+; CHECK-NEXT:    store i24 [[FOR]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %for = phi i24 [ 0, %entry ], [ %for.next, %loop ]
+  %iv.next = add i16 %iv, 1
+  %gep.src = getelementptr inbounds i24, ptr %src, i16 %iv
+  %for.next = load i24, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i24, ptr %dst, i16 %iv
+  store i24 %for, ptr %gep.dst
+  %ec = icmp eq i16 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}