diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c05e4b822d2c4..f30a76af83279 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10135,13 +10135,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, } SmallVector Mask; E->buildAltOpShuffleMask( - [E](Instruction *I) { + [&](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), + *TLI); }, Mask); VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + FinalVecTy, Mask, CostKind); // Patterns like [fadd,fsub] can be combined into a single instruction // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we // need to take into account their order when looking for the most used diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 9c3e1fbec663b..c97d8da58be44 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -86,16 +86,17 @@ return: define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR]], -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 -; CHECK-NEXT: [[OP_RDX_NOT:%.*]] = icmp eq i4 [[TMP3]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[T_FR]], [[SHIFT]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0 -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OP_RDX_NOT]], float [[ADD]], float 0.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[SHIFT]], [[T]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -400,16 +401,18 @@ return: define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[T_FR]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4 -; CHECK-NEXT: [[OP_RDX_NOT:%.*]] = icmp eq i4 [[TMP2]], 0 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]] -; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OP_RDX_NOT]], float [[CONV]], float 0.000000e+00 +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index c25e07cfd451f..865d817866716 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -388,28 +388,16 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_pred_diff( -; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], -; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] -; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) -; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false -; SSE-NEXT: ret i1 [[OP_RDX]] -; -; AVX-LABEL: @logical_and_icmp_clamp_pred_diff( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) -; AVX-NEXT: ret i1 [[TMP8]] +; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1