diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3a20b5044c5f..140c97ccd90ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6483,15 +6483,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
 
   // Try to use leading zeros of the dividend to reduce the multiplier and
   // avoid expensive fixups.
-  // TODO: Support vectors.
-  unsigned LeadingZeros = 0;
-  if (!VT.isVector() && isa<ConstantSDNode>(N1)) {
-    assert(!isOneConstant(N1) && "Unexpected divisor");
-    LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
-    // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros in
-    // the dividend exceeds the leading zeros for the divisor.
-    LeadingZeros = std::min(LeadingZeros, N1->getAsAPIntVal().countl_zero());
-  }
+  unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
@@ -6510,7 +6502,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       MagicFactor = NPQFactor = DAG.getUNDEF(SVT);
     } else {
       UnsignedDivisionByConstantInfo magics =
-          UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
+          UnsignedDivisionByConstantInfo::get(
+              Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
 
       MagicFactor = DAG.getConstant(magics.Magic, dl, SVT);
 
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index e429ac0c63c2d..d5a481549f851 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -755,3 +755,64 @@ define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
   %r = udiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
+
+define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
+; SSE2-LABEL: vector_div_leading_zeros:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: vector_div_leading_zeros:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pmuludq %xmm2, %xmm1
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: vector_div_leading_zeros:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vector_div_leading_zeros:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+;
+; XOP-LABEL: vector_div_leading_zeros:
+; XOP: # %bb.0:
+; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; XOP-NEXT: retq
+  %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+  %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %b
+}
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 886c3ae10324d..9e398096bfcc5 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -230,7 +230,7 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE2-LABEL: p7_vector_urem_by_const__nonsplat_undef2:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -238,7 +238,6 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
 ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,0,6,0,6,0,6,0]
 ; SSE2-NEXT: psubd %xmm2, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
@@ -249,12 +248,11 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [715827883,715827883,715827883,715827883]
 ; SSE4-NEXT: pmuludq %xmm2, %xmm1
 ; SSE4-NEXT: pmuludq %xmm0, %xmm2
 ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
 ; SSE4-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,0,6,0,6,0,6,0]
 ; SSE4-NEXT: psubd %xmm2, %xmm0
 ; SSE4-NEXT: pxor %xmm1, %xmm1
@@ -266,12 +264,11 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32
 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [715827883,715827883,715827883,715827883]
 ; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
 ; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [6,0,6,0,6,0,6,0]
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
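Note on the arithmetic behind the updated check lines (a standalone sketch, not part of the patch or of LLVM): once computeKnownBits proves the dividend is small, the reduced magic constants above need neither the NPQ fixup nor a post-shift. The dividend bounds used below are assumptions taken from the tests: x <= 255 for the udiv-by-7 case and x <= 128 for the urem-by-6 case, matching the masks applied before the division.

// Standalone C++ sketch; the 255 and 128 bounds mirror the masked dividends
// in the tests above and are the only assumptions made here.
#include <cassert>
#include <cstdint>

// 613566757 * 7 == 2^32 + 3, so for a small dividend the high 32 bits of a
// 32x32->64 multiply already equal x / 7 (no fixup, no post-shift).
static uint32_t udiv7_small(uint32_t x) {
  return static_cast<uint32_t>((uint64_t{x} * 613566757u) >> 32);
}

// 715827883 * 6 == 2^32 + 2; with the dividend limited to 128, the old
// "multiply by 2863311531, then shift right by 2" sequence collapses to a
// single multiply-high, which is why the psrld $2 disappears above.
static uint32_t udiv6_small(uint32_t x) {
  return static_cast<uint32_t>((uint64_t{x} * 715827883u) >> 32);
}

int main() {
  for (uint32_t x = 0; x <= 255; ++x)
    assert(udiv7_small(x) == x / 7);
  for (uint32_t x = 0; x <= 128; ++x)
    assert(udiv6_small(x) == x / 6);
  return 0;
}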