Skip to content

Commit 8932db7

Browse files
committed
Include support for Add/Mul/Or/And/Xor Binary Operations
1 parent 44a3268 commit 8932db7

File tree

2 files changed

+181
-51
lines changed

2 files changed

+181
-51
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 113 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -2995,14 +2995,17 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
29952995
return false;
29962996

29972997
std::queue<Value *> InstWorklist;
2998+
InstructionCost OrigCost = 0;
2999+
29983000
Value *InitEEV = nullptr;
2999-
Intrinsic::ID CommonOp = 0;
30003001

3001-
bool IsFirstCallInst = true;
3002-
bool ShouldBeCallInst = true;
3002+
unsigned int CommonCallOp = 0;
3003+
Instruction::BinaryOps CommonBinOp = Instruction::BinaryOpsEnd;
3004+
3005+
bool IsFirstCallOrBinInst = true;
3006+
bool ShouldBeCallOrBinInst = true;
30033007

30043008
SmallVector<Value *, 3> PrevVecV(3, nullptr);
3005-
int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
30063009
int64_t VecSize = -1;
30073010

30083011
Value *VecOp;
@@ -3014,10 +3017,18 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
30143017
return false;
30153018

30163019
VecSize = FVT->getNumElements();
3017-
if (VecSize < 2 || (VecSize % 2) != 0)
3020+
if (VecSize < 2)
30183021
return false;
30193022

3020-
ShuffleMaskHalf = 1;
3023+
unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3024+
int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3025+
3026+
for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3027+
Cur = (Cur + 1) / 2, --Mask) {
3028+
if (Cur & 1)
3029+
ExpectedParityMask |= (1ll << Mask);
3030+
}
3031+
30213032
PrevVecV[2] = VecOp;
30223033
InitEEV = EEI;
30233034

@@ -3032,24 +3043,24 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
30323043
return false;
30333044

30343045
if (auto *CallI = dyn_cast<CallInst>(CI)) {
3035-
if (!ShouldBeCallInst || !PrevVecV[2])
3046+
if (!ShouldBeCallOrBinInst || !PrevVecV[2])
30363047
return false;
30373048

3038-
if (!IsFirstCallInst &&
3049+
if (!IsFirstCallOrBinInst &&
30393050
any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
30403051
return false;
30413052

3042-
if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
3053+
if (CallI != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
30433054
return false;
3044-
IsFirstCallInst = false;
3055+
IsFirstCallOrBinInst = false;
30453056

30463057
auto *II = dyn_cast<IntrinsicInst>(CallI);
30473058
if (!II)
30483059
return false;
30493060

3050-
if (!CommonOp)
3051-
CommonOp = II->getIntrinsicID();
3052-
if (II->getIntrinsicID() != CommonOp)
3061+
if (!CommonCallOp)
3062+
CommonCallOp = II->getIntrinsicID();
3063+
if (II->getIntrinsicID() != CommonCallOp)
30533064
return false;
30543065

30553066
switch (II->getIntrinsicID()) {
@@ -3066,14 +3077,61 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
30663077
default:
30673078
return false;
30683079
}
3069-
ShouldBeCallInst ^= 1;
3080+
ShouldBeCallOrBinInst ^= 1;
3081+
3082+
IntrinsicCostAttributes ICA(
3083+
CommonCallOp, II->getType(),
3084+
{PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3085+
OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3086+
3087+
if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3088+
std::swap(PrevVecV[0], PrevVecV[1]);
3089+
InstWorklist.push(PrevVecV[1]);
3090+
InstWorklist.push(PrevVecV[0]);
3091+
} else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3092+
if (!ShouldBeCallOrBinInst || !PrevVecV[2])
3093+
return false;
3094+
3095+
if (!IsFirstCallOrBinInst &&
3096+
any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3097+
return false;
3098+
3099+
if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
3100+
return false;
3101+
IsFirstCallOrBinInst = false;
3102+
3103+
if (CommonBinOp == Instruction::BinaryOpsEnd)
3104+
CommonBinOp = BinOp->getOpcode();
3105+
3106+
if (BinOp->getOpcode() != CommonBinOp)
3107+
return false;
3108+
3109+
switch (CommonBinOp) {
3110+
case BinaryOperator::Add:
3111+
case BinaryOperator::Mul:
3112+
case BinaryOperator::Or:
3113+
case BinaryOperator::And:
3114+
case BinaryOperator::Xor: {
3115+
auto *Op0 = BinOp->getOperand(0);
3116+
auto *Op1 = BinOp->getOperand(1);
3117+
PrevVecV[0] = Op0;
3118+
PrevVecV[1] = Op1;
3119+
break;
3120+
}
3121+
default:
3122+
return false;
3123+
}
3124+
ShouldBeCallOrBinInst ^= 1;
3125+
3126+
OrigCost +=
3127+
TTI.getArithmeticInstrCost(CommonBinOp, BinOp->getType(), CostKind);
30703128

30713129
if (!isa<ShuffleVectorInst>(PrevVecV[1]))
30723130
std::swap(PrevVecV[0], PrevVecV[1]);
30733131
InstWorklist.push(PrevVecV[1]);
30743132
InstWorklist.push(PrevVecV[0]);
30753133
} else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3076-
if (ShouldBeCallInst ||
3134+
if (ShouldBeCallOrBinInst ||
30773135
any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
30783136
return false;
30793137

@@ -3084,33 +3142,42 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
30843142
if (!ShuffleVec || ShuffleVec != PrevVecV[0])
30853143
return false;
30863144

3145+
if (!isa<PoisonValue>(SVInst->getOperand(1)))
3146+
return false;
3147+
30873148
SmallVector<int> CurMask;
30883149
SVInst->getShuffleMask(CurMask);
30893150

3090-
if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
3091-
return false;
3092-
ExpectedShuffleMaskHalf *= 2;
3093-
30943151
for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3095-
if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
3152+
if (Mask < ShuffleMaskHalf &&
3153+
CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
30963154
return false;
30973155
if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
30983156
return false;
30993157
}
3158+
31003159
ShuffleMaskHalf *= 2;
3101-
if (ExpectedShuffleMaskHalf == VecSize)
3160+
ShuffleMaskHalf -= (ExpectedParityMask & 1);
3161+
ExpectedParityMask >>= 1;
3162+
3163+
OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3164+
SVInst->getType(), SVInst->getType(),
3165+
CurMask, CostKind);
3166+
3167+
VisitedCnt += 1;
3168+
if (!ExpectedParityMask && VisitedCnt == NumLevels)
31023169
break;
3103-
ShouldBeCallInst ^= 1;
3170+
3171+
ShouldBeCallOrBinInst ^= 1;
31043172
} else {
31053173
return false;
31063174
}
31073175
}
31083176

3109-
if (ShouldBeCallInst)
3177+
if (ShouldBeCallOrBinInst)
31103178
return false;
31113179

3112-
assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
3113-
"Expected Match for Vector Size and Mask Half");
3180+
assert(VecSize != -1 && "Expected Match for Vector Size");
31143181

31153182
Value *FinalVecV = PrevVecV[0];
31163183
auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
@@ -3121,33 +3188,28 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
31213188
assert(FinalVecVTy && "Expected non-null value for Vector Type");
31223189

31233190
Intrinsic::ID ReducedOp = 0;
3124-
switch (CommonOp) {
3125-
case Intrinsic::umin:
3126-
ReducedOp = Intrinsic::vector_reduce_umin;
3127-
break;
3128-
case Intrinsic::umax:
3129-
ReducedOp = Intrinsic::vector_reduce_umax;
3130-
break;
3131-
case Intrinsic::smin:
3132-
ReducedOp = Intrinsic::vector_reduce_smin;
3133-
break;
3134-
case Intrinsic::smax:
3135-
ReducedOp = Intrinsic::vector_reduce_smax;
3136-
break;
3137-
default:
3138-
return false;
3139-
}
3140-
3141-
InstructionCost OrigCost = 0;
3142-
unsigned int NumLevels = Log2_64(VecSize);
3143-
3144-
for (unsigned int Level = 0; Level < NumLevels; ++Level) {
3145-
OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3146-
FinalVecVTy, FinalVecVTy);
3147-
OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
3191+
if (CommonCallOp) {
3192+
switch (CommonCallOp) {
3193+
case Intrinsic::umin:
3194+
ReducedOp = Intrinsic::vector_reduce_umin;
3195+
break;
3196+
case Intrinsic::umax:
3197+
ReducedOp = Intrinsic::vector_reduce_umax;
3198+
break;
3199+
case Intrinsic::smin:
3200+
ReducedOp = Intrinsic::vector_reduce_smin;
3201+
break;
3202+
case Intrinsic::smax:
3203+
ReducedOp = Intrinsic::vector_reduce_smax;
3204+
break;
3205+
default:
3206+
return false;
3207+
}
3208+
} else if (CommonBinOp != Instruction::BinaryOpsEnd) {
3209+
ReducedOp = getReductionForBinop(CommonBinOp);
3210+
if (!ReducedOp)
3211+
return false;
31483212
}
3149-
OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
3150-
CostKind, 0);
31513213

31523214
IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
31533215
InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);

llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,52 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
1717
ret i16 %7
1818
}
1919

20+
; Positive test: 7-element (odd, non-power-of-two) shuffle/or chain with masks <3,4,5,6>, <2,3>, <1>; the CHECK lines expect a single llvm.vector.reduce.or call.
define i16 @test_reduce_v7i16_or(<7 x i16> %a0) {
21+
; CHECK-LABEL: define i16 @test_reduce_v7i16_or(
22+
; CHECK-SAME: <7 x i16> [[A0:%.*]]) {
23+
; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]])
24+
; CHECK-NEXT: ret i16 [[TMP1]]
25+
;
26+
%1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison>
27+
%2 = or <7 x i16> %a0, %1
28+
%3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
29+
%4 = or <7 x i16> %2, %3
30+
%5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
31+
%6 = or <7 x i16> %4, %5
32+
%7 = extractelement <7 x i16> %6, i64 0
33+
ret i16 %7
34+
}
35+
36+
; Positive test: 3-element shuffle/and chain with masks <1,2>, <1>; the CHECK lines expect a single llvm.vector.reduce.and call.
define i16 @test_reduce_v3i16_and(<3 x i16> %a0) {
37+
; CHECK-LABEL: define i16 @test_reduce_v3i16_and(
38+
; CHECK-SAME: <3 x i16> [[A0:%.*]]) {
39+
; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]])
40+
; CHECK-NEXT: ret i16 [[TMP1]]
41+
;
42+
%1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32> <i32 1, i32 2, i32 poison>
43+
%2 = and <3 x i16> %a0, %1
44+
%3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32> <i32 1, i32 poison, i32 poison>
45+
%4 = and <3 x i16> %2, %3
46+
%5 = extractelement <3 x i16> %4, i64 0
47+
ret i16 %5
48+
}
49+
50+
; Positive test: 6-element shuffle/xor chain with masks <3,4,5>, <1,2>, <1>; the CHECK lines expect a single llvm.vector.reduce.xor call.
define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) {
51+
; CHECK-LABEL: define i16 @test_reduce_v6i16_xor(
52+
; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
53+
; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]])
54+
; CHECK-NEXT: ret i16 [[TMP1]]
55+
;
56+
%1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
57+
%2 = xor <6 x i16> %a0, %1
58+
%3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
59+
%4 = xor <6 x i16> %2, %3
60+
%5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
61+
%6 = xor <6 x i16> %4, %5
62+
%7 = extractelement <6 x i16> %6, i64 0
63+
ret i16 %7
64+
}
65+
2066
define i16 @test_reduce_v8i16_2(<8 x i16> %a0) {
2167
; CHECK-LABEL: define i16 @test_reduce_v8i16_2(
2268
; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
@@ -125,3 +171,25 @@ define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) {
125171
%8 = extractelement <8 x i16> %7, i64 0
126172
ret i16 %8
127173
}
174+
175+
; Negative test: same chain as @test_reduce_v6i16_xor except the second shuffle mask starts <2, 3> instead of <1, 2>; the CHECK lines expect the shuffle/xor chain to be left unfolded.
define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) {
176+
; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg(
177+
; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
178+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
179+
; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]]
180+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
181+
; CHECK-NEXT: [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]]
182+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
183+
; CHECK-NEXT: [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]]
184+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0
185+
; CHECK-NEXT: ret i16 [[TMP7]]
186+
;
187+
%1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
188+
%2 = xor <6 x i16> %a0, %1
189+
%3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
190+
%4 = xor <6 x i16> %2, %3
191+
%5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
192+
%6 = xor <6 x i16> %4, %5
193+
%7 = extractelement <6 x i16> %6, i64 0
194+
ret i16 %7
195+
}

0 commit comments

Comments
 (0)