@@ -2995,14 +2995,17 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
2995
2995
return false ;
2996
2996
2997
2997
std::queue<Value *> InstWorklist;
2998
+ InstructionCost OrigCost = 0 ;
2999
+
2998
3000
Value *InitEEV = nullptr ;
2999
- Intrinsic::ID CommonOp = 0 ;
3000
3001
3001
- bool IsFirstCallInst = true ;
3002
- bool ShouldBeCallInst = true ;
3002
+ unsigned int CommonCallOp = 0 ;
3003
+ Instruction::BinaryOps CommonBinOp = Instruction::BinaryOpsEnd;
3004
+
3005
+ bool IsFirstCallOrBinInst = true ;
3006
+ bool ShouldBeCallOrBinInst = true ;
3003
3007
3004
3008
SmallVector<Value *, 3 > PrevVecV (3 , nullptr );
3005
- int64_t ShuffleMaskHalf = -1 , ExpectedShuffleMaskHalf = 1 ;
3006
3009
int64_t VecSize = -1 ;
3007
3010
3008
3011
Value *VecOp;
@@ -3014,10 +3017,18 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3014
3017
return false ;
3015
3018
3016
3019
VecSize = FVT->getNumElements ();
3017
- if (VecSize < 2 || (VecSize % 2 ) != 0 )
3020
+ if (VecSize < 2 )
3018
3021
return false ;
3019
3022
3020
- ShuffleMaskHalf = 1 ;
3023
+ unsigned int NumLevels = Log2_64_Ceil (VecSize), VisitedCnt = 0 ;
3024
+ int64_t ShuffleMaskHalf = 1 , ExpectedParityMask = 0 ;
3025
+
3026
+ for (int Cur = VecSize, Mask = NumLevels - 1 ; Cur > 1 ;
3027
+ Cur = (Cur + 1 ) / 2 , --Mask) {
3028
+ if (Cur & 1 )
3029
+ ExpectedParityMask |= (1ll << Mask);
3030
+ }
3031
+
3021
3032
PrevVecV[2 ] = VecOp;
3022
3033
InitEEV = EEI;
3023
3034
@@ -3032,24 +3043,24 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3032
3043
return false ;
3033
3044
3034
3045
if (auto *CallI = dyn_cast<CallInst>(CI)) {
3035
- if (!ShouldBeCallInst || !PrevVecV[2 ])
3046
+ if (!ShouldBeCallOrBinInst || !PrevVecV[2 ])
3036
3047
return false ;
3037
3048
3038
- if (!IsFirstCallInst &&
3049
+ if (!IsFirstCallOrBinInst &&
3039
3050
any_of (PrevVecV, [](Value *VecV) { return VecV == nullptr ; }))
3040
3051
return false ;
3041
3052
3042
- if (CallI != (IsFirstCallInst ? PrevVecV[2 ] : PrevVecV[0 ]))
3053
+ if (CallI != (IsFirstCallOrBinInst ? PrevVecV[2 ] : PrevVecV[0 ]))
3043
3054
return false ;
3044
- IsFirstCallInst = false ;
3055
+ IsFirstCallOrBinInst = false ;
3045
3056
3046
3057
auto *II = dyn_cast<IntrinsicInst>(CallI);
3047
3058
if (!II)
3048
3059
return false ;
3049
3060
3050
- if (!CommonOp )
3051
- CommonOp = II->getIntrinsicID ();
3052
- if (II->getIntrinsicID () != CommonOp )
3061
+ if (!CommonCallOp )
3062
+ CommonCallOp = II->getIntrinsicID ();
3063
+ if (II->getIntrinsicID () != CommonCallOp )
3053
3064
return false ;
3054
3065
3055
3066
switch (II->getIntrinsicID ()) {
@@ -3066,14 +3077,61 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3066
3077
default :
3067
3078
return false ;
3068
3079
}
3069
- ShouldBeCallInst ^= 1 ;
3080
+ ShouldBeCallOrBinInst ^= 1 ;
3081
+
3082
+ IntrinsicCostAttributes ICA (
3083
+ CommonCallOp, II->getType (),
3084
+ {PrevVecV[0 ]->getType (), PrevVecV[1 ]->getType ()});
3085
+ OrigCost += TTI.getIntrinsicInstrCost (ICA, CostKind);
3086
+
3087
+ if (!isa<ShuffleVectorInst>(PrevVecV[1 ]))
3088
+ std::swap (PrevVecV[0 ], PrevVecV[1 ]);
3089
+ InstWorklist.push (PrevVecV[1 ]);
3090
+ InstWorklist.push (PrevVecV[0 ]);
3091
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3092
+ if (!ShouldBeCallOrBinInst || !PrevVecV[2 ])
3093
+ return false ;
3094
+
3095
+ if (!IsFirstCallOrBinInst &&
3096
+ any_of (PrevVecV, [](Value *VecV) { return VecV == nullptr ; }))
3097
+ return false ;
3098
+
3099
+ if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2 ] : PrevVecV[0 ]))
3100
+ return false ;
3101
+ IsFirstCallOrBinInst = false ;
3102
+
3103
+ if (CommonBinOp == Instruction::BinaryOpsEnd)
3104
+ CommonBinOp = BinOp->getOpcode ();
3105
+
3106
+ if (BinOp->getOpcode () != CommonBinOp)
3107
+ return false ;
3108
+
3109
+ switch (CommonBinOp) {
3110
+ case BinaryOperator::Add:
3111
+ case BinaryOperator::Mul:
3112
+ case BinaryOperator::Or:
3113
+ case BinaryOperator::And:
3114
+ case BinaryOperator::Xor: {
3115
+ auto *Op0 = BinOp->getOperand (0 );
3116
+ auto *Op1 = BinOp->getOperand (1 );
3117
+ PrevVecV[0 ] = Op0;
3118
+ PrevVecV[1 ] = Op1;
3119
+ break ;
3120
+ }
3121
+ default :
3122
+ return false ;
3123
+ }
3124
+ ShouldBeCallOrBinInst ^= 1 ;
3125
+
3126
+ OrigCost +=
3127
+ TTI.getArithmeticInstrCost (CommonBinOp, BinOp->getType (), CostKind);
3070
3128
3071
3129
if (!isa<ShuffleVectorInst>(PrevVecV[1 ]))
3072
3130
std::swap (PrevVecV[0 ], PrevVecV[1 ]);
3073
3131
InstWorklist.push (PrevVecV[1 ]);
3074
3132
InstWorklist.push (PrevVecV[0 ]);
3075
3133
} else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3076
- if (ShouldBeCallInst ||
3134
+ if (ShouldBeCallOrBinInst ||
3077
3135
any_of (PrevVecV, [](Value *VecV) { return VecV == nullptr ; }))
3078
3136
return false ;
3079
3137
@@ -3084,33 +3142,42 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3084
3142
if (!ShuffleVec || ShuffleVec != PrevVecV[0 ])
3085
3143
return false ;
3086
3144
3145
+ if (!isa<PoisonValue>(SVInst->getOperand (1 )))
3146
+ return false ;
3147
+
3087
3148
SmallVector<int > CurMask;
3088
3149
SVInst->getShuffleMask (CurMask);
3089
3150
3090
- if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
3091
- return false ;
3092
- ExpectedShuffleMaskHalf *= 2 ;
3093
-
3094
3151
for (int Mask = 0 , MaskSize = CurMask.size (); Mask != MaskSize; ++Mask) {
3095
- if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
3152
+ if (Mask < ShuffleMaskHalf &&
3153
+ CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1 ))
3096
3154
return false ;
3097
3155
if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1 )
3098
3156
return false ;
3099
3157
}
3158
+
3100
3159
ShuffleMaskHalf *= 2 ;
3101
- if (ExpectedShuffleMaskHalf == VecSize)
3160
+ ShuffleMaskHalf -= (ExpectedParityMask & 1 );
3161
+ ExpectedParityMask >>= 1 ;
3162
+
3163
+ OrigCost += TTI.getShuffleCost (TargetTransformInfo::SK_PermuteSingleSrc,
3164
+ SVInst->getType (), SVInst->getType (),
3165
+ CurMask, CostKind);
3166
+
3167
+ VisitedCnt += 1 ;
3168
+ if (!ExpectedParityMask && VisitedCnt == NumLevels)
3102
3169
break ;
3103
- ShouldBeCallInst ^= 1 ;
3170
+
3171
+ ShouldBeCallOrBinInst ^= 1 ;
3104
3172
} else {
3105
3173
return false ;
3106
3174
}
3107
3175
}
3108
3176
3109
- if (ShouldBeCallInst )
3177
+ if (ShouldBeCallOrBinInst )
3110
3178
return false ;
3111
3179
3112
- assert (VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
3113
- " Expected Match for Vector Size and Mask Half" );
3180
+ assert (VecSize != -1 && " Expected Match for Vector Size" );
3114
3181
3115
3182
Value *FinalVecV = PrevVecV[0 ];
3116
3183
auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType ());
@@ -3121,33 +3188,28 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3121
3188
assert (FinalVecVTy && " Expected non-null value for Vector Type" );
3122
3189
3123
3190
Intrinsic::ID ReducedOp = 0 ;
3124
- switch (CommonOp) {
3125
- case Intrinsic::umin:
3126
- ReducedOp = Intrinsic::vector_reduce_umin;
3127
- break ;
3128
- case Intrinsic::umax:
3129
- ReducedOp = Intrinsic::vector_reduce_umax;
3130
- break ;
3131
- case Intrinsic::smin:
3132
- ReducedOp = Intrinsic::vector_reduce_smin;
3133
- break ;
3134
- case Intrinsic::smax:
3135
- ReducedOp = Intrinsic::vector_reduce_smax;
3136
- break ;
3137
- default :
3138
- return false ;
3139
- }
3140
-
3141
- InstructionCost OrigCost = 0 ;
3142
- unsigned int NumLevels = Log2_64 (VecSize);
3143
-
3144
- for (unsigned int Level = 0 ; Level < NumLevels; ++Level) {
3145
- OrigCost += TTI.getShuffleCost (TargetTransformInfo::SK_PermuteSingleSrc,
3146
- FinalVecVTy, FinalVecVTy);
3147
- OrigCost += TTI.getArithmeticInstrCost (Instruction::ICmp, FinalVecVTy);
3191
+ if (CommonCallOp) {
3192
+ switch (CommonCallOp) {
3193
+ case Intrinsic::umin:
3194
+ ReducedOp = Intrinsic::vector_reduce_umin;
3195
+ break ;
3196
+ case Intrinsic::umax:
3197
+ ReducedOp = Intrinsic::vector_reduce_umax;
3198
+ break ;
3199
+ case Intrinsic::smin:
3200
+ ReducedOp = Intrinsic::vector_reduce_smin;
3201
+ break ;
3202
+ case Intrinsic::smax:
3203
+ ReducedOp = Intrinsic::vector_reduce_smax;
3204
+ break ;
3205
+ default :
3206
+ return false ;
3207
+ }
3208
+ } else if (CommonBinOp != Instruction::BinaryOpsEnd) {
3209
+ ReducedOp = getReductionForBinop (CommonBinOp);
3210
+ if (!ReducedOp)
3211
+ return false ;
3148
3212
}
3149
- OrigCost += TTI.getVectorInstrCost (Instruction::ExtractElement, FinalVecVTy,
3150
- CostKind, 0 );
3151
3213
3152
3214
IntrinsicCostAttributes ICA (ReducedOp, FinalVecVTy, {FinalVecV});
3153
3215
InstructionCost NewCost = TTI.getIntrinsicInstrCost (ICA, CostKind);
0 commit comments