Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7752,6 +7752,12 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
if (auto *Inst = dyn_cast<VPInstruction>(Mask)) {
if (Inst->getOpcode() == VPInstruction::LogicalAnd) {
VPValue *Zero = Plan.getConstantInt(I->getType(), 0);
Mask = Builder.createICmp(CmpInst::ICMP_NE, Ops[1], Zero);
}
}
Comment on lines 7754 to +7760
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we need to only do this when the mask is an and? I think it should be correct to always just replace the SafeRHS with (select (icmp ne rhs, 0), rhs, 1) for any mask

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can give it a try. I was focusing on RISC-V specific conditions at the time, so I added this restriction. thank.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't add the restriction for the mask and, I see some regression cases. eg:

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 5f54b0ac7834..8b2e85dbd29e 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -13,7 +13,9 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <4 x i32> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT:    [[C:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = select i1 [[C]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]

VPValue *One = Plan.getConstantInt(I->getType(), 1u);
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -660,16 +660,16 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
; COMMON-NEXT: br label %[[EXIT]]
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; COMMON: [[PRED_STORE_CONTINUE14]]:
; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; COMMON: [[MIDDLE_BLOCK]]:
; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
; COMMON: [[SCALAR_PH]]:
; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
; COMMON-NEXT: ret void
;
entry:
Expand Down Expand Up @@ -1303,7 +1303,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; PRED: [[VECTOR_MEMCHECK]]:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]]
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]]
Expand All @@ -1312,42 +1312,42 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; PRED-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[Y]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4
; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2
; PRED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP0]], [[TMP8]]
; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], [[TMP8]]
; PRED-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[Y]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: [[TMP16:%.*]] = icmp ne <vscale x 4 x i8> [[BROADCAST_SPLAT]], zeroinitializer
; PRED-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i8> [[BROADCAST_SPLAT]], <vscale x 4 x i8> splat (i8 1)
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x float>
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; PRED-NEXT: [[TMP15:%.*]] = uitofp <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x float>
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; PRED-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], zeroinitializer
; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
; PRED-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[TMP16]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> splat (i8 1)
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 16 x i8> [[TMP17]], [[TMP18]]
; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 16 x i8> [[TMP19]], splat (i8 1)
; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 16 x i1> [[TMP20]], <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> splat (i32 255)
; PRED-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i32> [[TMP21]], <vscale x 16 x i32> zeroinitializer
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
; PRED-NEXT: [[TMP23:%.*]] = sub <vscale x 16 x i32> [[PREDPHI]], [[TMP22]]
; PRED-NEXT: [[TMP24:%.*]] = sitofp <vscale x 16 x i32> [[TMP23]] to <vscale x 16 x float>
; PRED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> [[TMP24]], <vscale x 16 x float> splat (float 3.000000e+00), <vscale x 16 x float> [[TMP13]])
; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 16 x float> [[TMP25]] to <vscale x 16 x i8>
; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; PRED-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_MASKED_LOAD5]], zeroinitializer
; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i8> [[TMP18]], [[TMP13]]
; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 4 x i8> [[TMP19]], splat (i8 1)
; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> splat (i32 255)
; PRED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP21]]
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; PRED-NEXT: [[TMP23:%.*]] = sub <vscale x 4 x i32> [[PREDPHI]], [[TMP22]]
; PRED-NEXT: [[TMP24:%.*]] = sitofp <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x float>
; PRED-NEXT: [[TMP25:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP24]], <vscale x 4 x float> splat (float 3.000000e+00), <vscale x 4 x float> [[TMP15]])
; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 4 x float> [[TMP25]] to <vscale x 4 x i8>
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr align 1 [[TMP27]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP26]], ptr align 1 [[TMP27]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[CONV6]], 0
; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP16]], i64 [[CONV6]], i64 1
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP19]], [[CONV61]]
; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[X]], [[TMP20]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
Expand All @@ -128,14 +134,8 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP22:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i1> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i1> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1
; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]]
; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[INDEX]], [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32
; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[X]], [[TMP27]]
; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], [[TMP30]]
; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
Expand Down
Loading