From 7628b3c0248198b659ea753eb0334bd83d768807 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 1 Mar 2024 17:08:19 +0000
Subject: [PATCH] [X86] Convert logicalshift(x, C) -> and(x, M) iff x is allsignbits

If we're shifting an all-signbits value, then we can just mask out the
shifted bits.

This helps remove some unnecessary bitcasted vXi16 shifts used for vXi8
shifts (which SimplifyDemandedBits will struggle to remove through the
bitcast), and allows some AVX1 shifts of 256-bit values to stay as a YMM
instruction.

Noticed in codegen from #82290
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  12 +-
 .../X86/bitcast-int-to-vector-bool-zext.ll    | 103 +++++++-----------
 .../CodeGen/X86/bitcast-int-to-vector-bool.ll |  11 +-
 llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll     |  18 +--
 4 files changed, 52 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0162bb65afe3b..866a2a94a0bfe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28932,6 +28932,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();

   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
@@ -28978,7 +28979,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
     return SDValue();

   // If the shift amount is out of range, return undef.
-  if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+  if (APIntShiftAmt.uge(EltSizeInBits))
     return DAG.getUNDEF(VT);

   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
@@ -29006,6 +29007,15 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
       Op.getOpcode() == ISD::SRA)
     return ArithmeticShiftRight64(ShiftAmt);

+  // If we're logical shifting an all-signbits value then we can just perform
+  // this as a mask.
+ if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) && + DAG.ComputeNumSignBits(R) == EltSizeInBits) { + SDValue Mask = DAG.getAllOnesConstant(dl, VT); + Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt); + return DAG.getNode(ISD::AND, dl, VT, R, Mask); + } + if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || (Subtarget.hasBWI() && VT == MVT::v64i8)) { unsigned NumElts = VT.getVectorNumElements(); diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index ee39b1333fff3..d2794df731b65 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -180,7 +180,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $7, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; @@ -191,7 +190,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $7, %xmm0 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; @@ -203,7 +201,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -214,7 +211,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -268,11 +264,10 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i4_4i64: @@ -328,11 +323,10 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i8_8i32: @@ -390,11 +384,10 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i16_16i16: @@ -436,14 +429,12 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm0 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: retq ; @@ -460,13 +451,9 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i32_32i8: @@ -477,7 +464,6 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -550,19 +536,18 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2 ; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,1,1,1] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [16,32,64,128] +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i8_8i64: @@ -631,19 +616,18 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[256,512,1024,2048,4096,8192,16384,32768] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [256,512,1024,2048,4096,8192,16384,32768] +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i16_16i32: @@ -712,23 +696,22 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768] -; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i32_32i16: @@ -782,26 +765,22 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm0 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,3,3,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,5,5] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 ; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,7,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: psrlw $7, %xmm3 ; 
SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: retq ; @@ -817,26 +796,20 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i64_64i8: @@ -847,13 +820,11 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 79d8e4acbba5a..c27b5b289e1fa 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -185,7 +185,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $7, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; @@ -196,7 +195,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $7, %xmm0 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; @@ -208,7 +206,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -219,7 +216,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -252,13 +248,9 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: bitcast_i32_32i1: @@ -269,7 +261,6 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll index 3887d9547fd06..bc546fe857a3e 100644 --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -623,28 +623,24 @@ define <16 x i8> @shl_s3_cmp_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NOBMI-LABEL: shl_s3_cmp_v16i8: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: pcmpeqb %xmm1, %xmm0 -; CHECK-NOBMI-NEXT: psllw $3, %xmm0 ; CHECK-NOBMI-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI2-SSE2-LABEL: shl_s3_cmp_v16i8: ; CHECK-BMI2-SSE2: # %bb.0: ; CHECK-BMI2-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; CHECK-BMI2-SSE2-NEXT: psllw $3, %xmm0 ; CHECK-BMI2-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-BMI2-SSE2-NEXT: retq ; ; CHECK-AVX12-LABEL: shl_s3_cmp_v16i8: ; CHECK-AVX12: # %bb.0: ; CHECK-AVX12-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; CHECK-AVX12-NEXT: vpsllw $3, %xmm0, %xmm0 ; CHECK-AVX12-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX12-NEXT: retq ; ; CHECK-AVX512-LABEL: shl_s3_cmp_v16i8: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq %cmp = icmp eq <16 x i8> %x, %y @@ -673,10 +669,7 @@ define <4 x i64> @shl_s31_cmp_v4f64(<4 x double> %x, <4 x double> %y) { ; CHECK-AVX1-LABEL: shl_s31_cmp_v4f64: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; CHECK-AVX1-NEXT: vpsllq $31, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-AVX1-NEXT: vpsllq $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: shl_s31_cmp_v4f64: @@ -700,28 +693,24 @@ define <16 x i8> @shr_s1_cmp_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NOBMI-LABEL: 
shr_s1_cmp_v16i8: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: pcmpeqb %xmm1, %xmm0 -; CHECK-NOBMI-NEXT: psrlw $1, %xmm0 ; CHECK-NOBMI-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI2-SSE2-LABEL: shr_s1_cmp_v16i8: ; CHECK-BMI2-SSE2: # %bb.0: ; CHECK-BMI2-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; CHECK-BMI2-SSE2-NEXT: psrlw $1, %xmm0 ; CHECK-BMI2-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-BMI2-SSE2-NEXT: retq ; ; CHECK-AVX12-LABEL: shr_s1_cmp_v16i8: ; CHECK-AVX12: # %bb.0: ; CHECK-AVX12-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; CHECK-AVX12-NEXT: vpsrlw $1, %xmm0, %xmm0 ; CHECK-AVX12-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX12-NEXT: retq ; ; CHECK-AVX512-LABEL: shr_s1_cmp_v16i8: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq %cmp = icmp eq <16 x i8> %x, %y @@ -753,9 +742,8 @@ define <8 x i32> @shr_s9_cmp_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; CHECK-AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $9, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $9, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: shr_s9_cmp_v8i32:
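
For reference, the combine rests on a simple scalar identity: when every element
of R is all-signbits (each lane is 0 or all-ones, as produced by pcmpeq*/vcmpeq*),
a logical shift can only clear bits, so shl(R, C) == and(R, shl(AllOnes, C)) and
srl(R, C) == and(R, srl(AllOnes, C)). Below is a minimal standalone C++ sketch
(not part of the patch; names are illustrative) that checks the identity for
8-bit lanes.

// Standalone sketch (not part of the patch): checks, for 8-bit lanes, the
// scalar identity the new combine relies on. A lane that is "all-signbits"
// (0x00 or 0xFF) loses nothing when a logical shift is replaced by an AND
// with the shifted all-ones mask:
//   shl(R, C) == and(R, shl(AllOnes, C))
//   srl(R, C) == and(R, srl(AllOnes, C))
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t AllOnes = 0xFF;
  const uint8_t Lanes[] = {0x00, 0xFF}; // the only all-signbits i8 values
  for (uint8_t Lane : Lanes) {
    for (unsigned C = 0; C < 8; ++C) {
      assert(uint8_t(Lane << C) == (Lane & uint8_t(AllOnes << C)));
      assert(uint8_t(Lane >> C) == (Lane & uint8_t(AllOnes >> C)));
    }
  }
  return 0;
}

This is why the updated tests above can drop the psrlw/psrld/psrlq instructions:
the shift of a compare result folds into (or is replaced by) a single AND with a
constant-pool mask, which on AVX1 also keeps the value in a single YMM register.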