diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35f756ea5e1d8..aed1a8e5cfd5a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23391,6 +23391,136 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
+  // We get bad codegen for v8i32 compares on avx targets (without avx2) so if
+  // possible convert to a v8f32 compare.
+  if (VTOp0 == MVT::v8i32 && Subtarget.hasAVX() && !Subtarget.hasAVX2()) {
+    std::optional<KnownBits> KnownOps[2];
+    // Returns true iff operand OpNo is known u< Bound (CmpLT) or u> Bound.
+    auto OpInRange = [&DAG, Op, &KnownOps](unsigned OpNo, bool CmpLT,
+                                           const APInt &Bound) {
+      if (!KnownOps[OpNo].has_value())
+        KnownOps[OpNo] = DAG.computeKnownBits(Op.getOperand(OpNo));
+
+      if (KnownOps[OpNo]->isUnknown())
+        return false;
+
+      std::optional<bool> Res;
+      if (CmpLT)
+        Res = KnownBits::ult(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+      else
+        Res = KnownBits::ugt(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+      return Res.value_or(false);
+    };
+
+    bool OkayCvt = false;
+    bool OkayBitcast = false;
+
+    const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(MVT::f32);
+
+    // For cvt up to 1 << (Significand Precision), (1 << 24 for ieee float)
+    const APInt MaxConvertableCvt =
+        APInt::getOneBitSet(32, APFloat::semanticsPrecision(Sem));
+    // For bitcast up to (and including) first inf representation (0x7f800000 +
+    // 1 for ieee float)
+    const APInt MaxConvertableBitcast =
+        APFloat::getInf(Sem).bitcastToAPInt() + 1;
+    // For bitcast we also exclude de-norm values. This is not necessary for
+    // strict semantic correctness, but DAZ (de-norm as zero) will break if we
+    // don't have this check.
+    const APInt MinConvertableBitcast =
+        APFloat::getSmallestNormalized(Sem).bitcastToAPInt() - 1;
+
+    assert(
+        MaxConvertableBitcast.getBitWidth() == 32 &&
+        MaxConvertableCvt == (1U << 24) &&
+        MaxConvertableBitcast == 0x7f800001 &&
+        MinConvertableBitcast.isNonNegative() &&
+        MaxConvertableBitcast.sgt(MinConvertableBitcast) &&
+        "This transform has only been verified to IEEE Single Precision Float");
+
+    // For bitcast both lhs and rhs must lie strictly between the two bounds.
+    // NB: It might be worth it to enable the bitcast version for unsigned avx2
+    // comparisons as they typically require multiple instructions to lower
+    // (they don't fit `vpcmpeq`/`vpcmpgt` well).
+    if (OpInRange(1, /*CmpLT*/ true, MaxConvertableBitcast) &&
+        OpInRange(1, /*CmpLT*/ false, MinConvertableBitcast) &&
+        OpInRange(0, /*CmpLT*/ true, MaxConvertableBitcast) &&
+        OpInRange(0, /*CmpLT*/ false, MinConvertableBitcast)) {
+      OkayBitcast = true;
+    }
+    // We want to convert icmp -> fcmp using `sitofp` iff one of the converts
+    // will be constant folded.
+    else if ((DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op1)) ||
+              DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op0)))) {
+      if (isUnsignedIntSetCC(Cond)) {
+        // For cvt + unsigned compare we need both lhs/rhs >= 0 and either lhs
+        // or rhs < MaxConvertableCvt
+
+        if (OpInRange(1, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+            OpInRange(0, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+            (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+             OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt)))
+          OkayCvt = true;
+      } else {
+        // For cvt + signed compare we need at least one of abs(lhs) or abs(rhs)
+        // to be < MaxConvertableCvt
+        if (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+            OpInRange(1, /*CmpLT*/ false, -MaxConvertableCvt) ||
+            OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt) ||
+            OpInRange(0, /*CmpLT*/ false, -MaxConvertableCvt))
+          OkayCvt = true;
+      }
+    }
+    // TODO: If we can't prove any of the ranges, we could unconditionally lower
+    // `(icmp eq lhs, rhs)` as `(icmp eq (int_to_fp (xor lhs, rhs)), zero)`
+    if (OkayBitcast || OkayCvt) {
+      switch (Cond) {
+      default:
+        llvm_unreachable("Unexpected SETCC condition");
+        // Get the new FP condition. Note for the unsigned conditions we have
+        // verified it's okay to convert to the signed version.
+      case ISD::SETULT:
+      case ISD::SETLT:
+        Cond = ISD::SETOLT;
+        break;
+      case ISD::SETUGT:
+      case ISD::SETGT:
+        Cond = ISD::SETOGT;
+        break;
+      case ISD::SETULE:
+      case ISD::SETLE:
+        Cond = ISD::SETOLE;
+        break;
+      case ISD::SETUGE:
+      case ISD::SETGE:
+        Cond = ISD::SETOGE;
+        break;
+      case ISD::SETEQ:
+        Cond = ISD::SETOEQ;
+        break;
+      case ISD::SETNE:
+        Cond = ISD::SETONE;
+        break;
+      }
+
+      MVT FpVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+      SDNodeFlags Flags;
+      Flags.setNoNaNs(true);
+      Flags.setNoInfs(true);
+      Flags.setNoSignedZeros(true);
+      if (OkayBitcast) {
+        Op0 = DAG.getBitcast(FpVT, Op0);
+        Op1 = DAG.getBitcast(FpVT, Op1);
+      } else {
+        Op0 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op0);
+        Op1 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op1);
+      }
+      Op0->setFlags(Flags);
+      Op1->setFlags(Flags);
+      return DAG.getSetCC(dl, VT, Op0, Op1, Cond);
+    }
+  }
+
   // Break 256-bit integer vector compare into smaller ones.
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
@@ -41216,6 +41346,156 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Simplify a decomposed (sext (setcc)). Assumes prior check that
+// bitwidth(sext)==bitwidth(setcc operands).
+static SDValue simplifySExtOfDecomposedSetCCImpl(
+    SelectionDAG &DAG, const SDLoc &DL, ISD::CondCode CC, SDValue Op0,
+    SDValue Op1, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, bool AllowNOT, unsigned Depth) {
+  // Possible TODO: We could handle any power of two demanded bit + unsigned
+  // comparison. There are no x86 specific comparisons that are unsigned so
+  // it's unneeded.
+  if (!OriginalDemandedBits.isSignMask())
+    return SDValue();
+
+  EVT OpVT = Op0.getValueType();
+  // We need nofpclass(nan inf nzero) to handle floats.
+  auto hasOkayFPFlags = [](SDValue Op) {
+    return Op.getOpcode() == ISD::SINT_TO_FP ||
+           Op.getOpcode() == ISD::UINT_TO_FP ||
+           (Op->getFlags().hasNoNaNs() && Op->getFlags().hasNoInfs() &&
+            Op->getFlags().hasNoSignedZeros());
+  };
+
+  if (OpVT.isFloatingPoint() && !hasOkayFPFlags(Op0))
+    return SDValue();
+
+  auto ValsEq = [OpVT](const APInt &V0, APInt V1) -> bool {
+    if (OpVT.isFloatingPoint()) {
+      const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+      return V0.eq(APFloat(Sem, V1).bitcastToAPInt());
+    }
+    return V0.eq(V1);
+  };
+
+  // Assume we canonicalized constants to Op1. That isn't always true but we
+  // call this function twice with swapped CC/Operands so it's fine either way.
+  APInt Op1C;
+  unsigned ValWidth = OriginalDemandedBits.getBitWidth();
+  if (ISD::isConstantSplatVectorAllZeros(Op1.getNode())) {
+    Op1C = APInt::getZero(ValWidth);
+  } else if (ISD::isConstantSplatVectorAllOnes(Op1.getNode())) {
+    Op1C = APInt::getAllOnes(ValWidth);
+  } else if (auto *C = dyn_cast<ConstantFPSDNode>(Op1)) {
+    Op1C = C->getValueAPF().bitcastToAPInt();
+  } else if (auto *C = dyn_cast<ConstantSDNode>(Op1)) {
+    Op1C = C->getAPIntValue();
+  } else if (ISD::isConstantSplatVector(Op1.getNode(), Op1C)) {
+    // isConstantSplatVector sets `Op1C`.
+  } else {
+    return SDValue();
+  }
+
+  bool Not = false;
+  bool Okay = false;
+  assert(OriginalDemandedBits.getBitWidth() == Op1C.getBitWidth() &&
+         "Invalid constant operand");
+
+  switch (CC) {
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    // signbit(sext(x s< 0)) == signbit(x)
+    // signbit(sext(x s>= 0)) == signbit(~x)
+    Okay = ValsEq(Op1C, APInt::getZero(ValWidth));
+    // For float ops we need to ensure Op0 is not de-norm. Otherwise DAZ can
+    // break this fold.
+    // NB: We only need de-norm check here, for the rest of the constants any
+    // relationship with a de-norm value and zero will be identical.
+    if (Okay && OpVT.isFloatingPoint()) {
+      // Values from integers are always normal.
+      if (Op0.getOpcode() == ISD::SINT_TO_FP ||
+          Op0.getOpcode() == ISD::UINT_TO_FP)
+        break;
+
+      // See if we can prove normal with known bits.
+      KnownBits Op0Known =
+          DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth);
+      // Negative/positive doesn't matter.
+      Op0Known.One.clearSignBit();
+      Op0Known.Zero.clearSignBit();
+
+      // Get min normal value.
+      const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+      KnownBits MinNormal = KnownBits::makeConstant(
+          APFloat::getSmallestNormalized(Sem).bitcastToAPInt());
+      // Are we above de-norm range?
+      std::optional<bool> Op0Normal = KnownBits::uge(Op0Known, MinNormal);
+      Okay = Op0Normal.value_or(false);
+    }
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    // signbit(sext(x s<= -1)) == signbit(x)
+    // signbit(sext(x s> -1)) == signbit(~x)
+    Okay = ValsEq(Op1C, APInt::getAllOnes(ValWidth));
+    break;
+  case ISD::SETULT:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETUGE:
+    // signbit(sext(x u>= SIGNED_MIN)) == signbit(x)
+    // signbit(sext(x u< SIGNED_MIN)) == signbit(~x)
+    Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits);
+    break;
+  case ISD::SETULE:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETUGT:
+    // signbit(sext(x u> SIGNED_MAX)) == signbit(x)
+    // signbit(sext(x u<= SIGNED_MAX)) == signbit(~x)
+    Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits - 1);
+    break;
+  default:
+    break;
+  }
+
+  Okay &= Not ? AllowNOT : true;
+  if (!Okay)
+    return SDValue();
+
+  if (!Not)
+    return Op0;
+
+  if (!OpVT.isFloatingPoint())
+    return DAG.getNOT(DL, Op0, OpVT);
+
+  // Possible TODO: We could use `fneg` to implement the NOT for float types.
+  return SDValue();
+}
+
+static SDValue simplifySExtOfDecomposedSetCC(SelectionDAG &DAG, const SDLoc &DL,
+                                             ISD::CondCode CC, SDValue Op0,
+                                             SDValue Op1,
+                                             const APInt &OriginalDemandedBits,
+                                             const APInt &OriginalDemandedElts,
+                                             bool AllowNOT, unsigned Depth) {
+  if (SDValue R = simplifySExtOfDecomposedSetCCImpl(
+          DAG, DL, CC, Op0, Op1, OriginalDemandedBits, OriginalDemandedElts,
+          AllowNOT, Depth))
+    return R;
+  return simplifySExtOfDecomposedSetCCImpl(
+      DAG, DL, ISD::getSetCCSwappedOperands(CC), Op1, Op0, OriginalDemandedBits,
+      OriginalDemandedElts, AllowNOT, Depth);
+}
+
 // Simplify variable target shuffle masks based on the demanded elements.
 // TODO: Handle DemandedBits in mask indices as well?
 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
@@ -42395,13 +42675,26 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     }
     break;
   }
-  case X86ISD::PCMPGT:
-    // icmp sgt(0, R) == ashr(R, BitWidth-1).
-    // iff we only need the sign bit then we can use R directly.
-    if (OriginalDemandedBits.isSignMask() &&
-        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
-      return TLO.CombineTo(Op, Op.getOperand(1));
+  case X86ISD::PCMPGT: {
+    SDLoc DL(Op);
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            TLO.DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+            OriginalDemandedBits, OriginalDemandedElts,
+            /*AllowNOT*/ true, Depth))
+      return TLO.CombineTo(Op, R);
+    break;
+  }
+  case X86ISD::CMPP: {
+    SDLoc DL(Op);
+    ISD::CondCode CC = X86::getCondForCMPPImm(
+        cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            TLO.DAG, DL, CC, Op.getOperand(0), Op.getOperand(1),
+            OriginalDemandedBits, OriginalDemandedElts,
+            /*AllowNOT*/ !(TLO.LegalOperations() && TLO.LegalTypes()), Depth))
+      return TLO.CombineTo(Op, R);
     break;
+  }
   case X86ISD::MOVMSK: {
     SDValue Src = Op.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
@@ -42585,13 +42878,25 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
     if (DemandedBits.isSignMask())
       return Op.getOperand(0);
     break;
-  case X86ISD::PCMPGT:
-    // icmp sgt(0, R) == ashr(R, BitWidth-1).
-    // iff we only need the sign bit then we can use R directly.
-    if (DemandedBits.isSignMask() &&
-        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
-      return Op.getOperand(1);
+  case X86ISD::PCMPGT: {
+    SDLoc DL(Op);
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+            DemandedBits, DemandedElts, /*AllowNOT*/ false, Depth))
+      return R;
+    break;
+  }
+  case X86ISD::CMPP: {
+    SDLoc DL(Op);
+    ISD::CondCode CC = X86::getCondForCMPPImm(
+        cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+    if (SDValue R = simplifySExtOfDecomposedSetCC(DAG, DL, CC, Op.getOperand(0),
+                                                  Op.getOperand(1),
+                                                  DemandedBits, DemandedElts,
+                                                  /*AllowNOT*/ false, Depth))
+      return R;
     break;
+  }
   case X86ISD::BLENDV: {
     // BLENDV: Cond (MSB) ? LHS : RHS
     SDValue Cond = Op.getOperand(0);
@@ -48267,7 +48572,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index eb42a4b2119d5..c5ca14b672650 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3360,6 +3360,46 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
   }
 }
 
+ISD::CondCode X86::getCondForCMPPImm(unsigned Imm) {
+  assert(Imm <= 0x1f && "Invalid CMPP Imm");
+  switch (Imm & 0xf) {
+  default:
+    llvm_unreachable("Invalid CMPP Imm");
+  case 0:
+    return ISD::SETOEQ;
+  case 1:
+    return ISD::SETOLT;
+  case 2:
+    return ISD::SETOLE;
+  case 3:
+    return ISD::SETUO;
+  case 4:
+    return ISD::SETUNE;
+  case 5:
+    return ISD::SETUGE;
+  case 6:
+    return ISD::SETUGT;
+  case 7:
+    return ISD::SETO;
+  case 8:
+    return ISD::SETUEQ;
+  case 9:
+    return ISD::SETULT;
+  case 10:
+    return ISD::SETULE;
+  case 11:
+    return ISD::SETFALSE;
+  case 12:
+    return ISD::SETONE;
+  case 13:
+    return ISD::SETOGE;
+  case 14:
+    return ISD::SETOGT;
+  case 15:
+    return ISD::SETTRUE;
+  }
+}
+
 /// Get the VPCMP immediate if the operands are swapped.
 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
   switch (Imm) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index e719be0caf3ee..f40e0f6a44784 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -72,6 +72,9 @@ CondCode GetOppositeBranchCondition(CondCode CC);
 
 /// Get the VPCMP immediate for the given condition.
unsigned getVPCMPImmForCond(ISD::CondCode CC); +/// Get the CondCode from a CMPP immediate. +ISD::CondCode getCondForCMPPImm(unsigned Imm); + /// Get the VPCMP immediate if the opcodes are swapped. unsigned getSwappedVPCMPImm(unsigned Imm); diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 6255621d870e1..eef2b3db5d694 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -256,12 +256,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i8_8i32: @@ -487,18 +484,12 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpeqd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i16_16i32: diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index d2794df731b65..5c810797bd2b7 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -320,12 +320,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -613,20 +610,14 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [256,512,1024,2048,4096,8192,16384,32768] -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll new file mode 100644 index 0000000000000..15b909350b267 --- /dev/null +++ b/llvm/test/CodeGen/X86/cmpf-avx.ll @@ -0,0 +1,250 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64 + +define <8 x i32> @cmp_eq_bitcast(<8 x i32> %x) { +; X86-LABEL: cmp_eq_bitcast: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_eq_bitcast: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq + %and = and <8 x i32> %x, + %cmp = icmp eq <8 x i32> %and, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> 
%sext +} + +define <8 x i32> @cmp_ne_sitofp(<8 x i32> %x) { +; X86-LABEL: cmp_ne_sitofp: +; X86: # %bb.0: +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_ne_sitofp: +; X64: # %bb.0: +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq + %cmp = icmp ne <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_slt_fail_no_const(<8 x i32> %x, <8 x i32> %y) { +; X86-LABEL: cmp_slt_fail_no_const: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_slt_fail_no_const: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-NEXT: retq + %and = and <8 x i32> %x, + %cmp = icmp slt <8 x i32> %and, %y + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_eq_sitofp(<8 x i32> %x) { +; X86-LABEL: cmp_eq_sitofp: +; X86: # %bb.0: +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_eq_sitofp: +; X64: # %bb.0: +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq + %cmp = icmp eq <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_sgt_fail_no_bounds(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: 
cmp_sgt_fail_no_bounds: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %cmp = icmp slt <8 x i32> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_sgt_bitcast(<8 x i32> %xx, <8 x i32> %yy) { +; CHECK-LABEL: cmp_sgt_bitcast: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] +; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vandps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = and <8 x i32> %xx, + %y = and <8 x i32> %yy, + + %cmp = icmp sgt <8 x i32> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_sle_fail_out_of_bounds(<8 x i32> %xx) { +; X86-LABEL: cmp_sle_fail_out_of_bounds: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041] +; X86-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 +; X86-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_sle_fail_out_of_bounds: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041] +; X64-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 +; X64-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 +; X64-NEXT: retq + %x = and <8 x i32> %xx, + %cmp = icmp sle <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_eq_fail_out_of_bounds(<8 x i32> %x) { +; CHECK-LABEL: cmp_eq_fail_out_of_bounds: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777216,16777216,16777216,16777216] +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %cmp = icmp eq <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_eq_fail_out_of_bounds2(<8 x i32> %x) { +; CHECK-LABEL: cmp_eq_fail_out_of_bounds2: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [4278190080,4278190080,4278190080,4278190080] +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %cmp = icmp eq <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_eq_todo(<8 x i32> %x) { +; X86-LABEL: cmp_eq_todo: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_eq_todo: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: retq + %cmp = icmp eq <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_ult_fail_maybe_negative(<8 x i32> %x) { +; CHECK-LABEL: 
cmp_ult_fail_maybe_negative: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; CHECK-NEXT: vpminud %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vpminud %xmm2, %xmm0, %xmm2 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %cmp = icmp ult <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_ule_bitcast(<8 x i32> %xx) { +; X86-LABEL: cmp_ule_bitcast: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_ule_bitcast: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: retq + %x = and <8 x i32> %xx, + %cmp = icmp ule <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} + +define <8 x i32> @cmp_ugt_sitofp(<8 x i32> %xx) { +; X86-LABEL: cmp_ugt_sitofp: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; X86-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: cmp_ugt_sitofp: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; X64-NEXT: retq + %x = and <8 x i32> %xx, + %cmp = icmp ugt <8 x i32> %x, + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} diff --git a/llvm/test/CodeGen/X86/combine-testps.ll 
b/llvm/test/CodeGen/X86/combine-testps.ll index 43dddbecf51a7..f5ed2853d41aa 100644 --- a/llvm/test/CodeGen/X86/combine-testps.ll +++ b/llvm/test/CodeGen/X86/combine-testps.ll @@ -171,13 +171,22 @@ define i32 @testpsz_128_signbit(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b) } define i32 @testpsnzc_256_signbit(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) { -; CHECK-LABEL: testpsnzc_256_signbit: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: vtestps %ymm1, %ymm0 -; CHECK-NEXT: cmovnel %esi, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; AVX-LABEL: testpsnzc_256_signbit: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vtestps %ymm1, %ymm0 +; AVX-NEXT: cmovnel %esi, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: testpsnzc_256_signbit: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %t0 = bitcast <8 x float> %c to <8 x i32> %t1 = icmp sgt <8 x i32> zeroinitializer, %t0 %t2 = sext <8 x i1> %t1 to <8 x i32> diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll index 3187bf6448690..0ab572a50ed3d 100644 --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1844,25 +1844,25 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; ; AVX1-LABEL: compressstore_v32f32_v32i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5 +; AVX1-NEXT: vxorps %xmm8, 
%xmm8, %xmm8 +; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9 +; AVX1-NEXT: vpackssdw %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4 +; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9 +; AVX1-NEXT: vpackssdw %xmm9, %xmm4, %xmm4 ; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm7, %ymm4 +; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm5 +; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index 4c5b67962a58b..2a010e78b41ed 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -691,14 +691,14 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, < ; ; AVX1-LABEL: expandload_v16f64_v16i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5 +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vcmpeqps %ymm6, %ymm5, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackssdw %xmm7, 
%xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4 +; AVX1-NEXT: vcmpeqps %ymm6, %ymm4, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %eax @@ -1989,25 +1989,25 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32 ; ; AVX1-LABEL: expandload_v32f32_v32i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5 +; AVX1-NEXT: vxorps %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9 +; AVX1-NEXT: vpackssdw %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4 +; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9 +; AVX1-NEXT: vpackssdw %xmm9, %xmm4, %xmm4 ; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm7, %ymm4 +; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm5 +; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; 
AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll index 559a7ec0930b9..ff311ea67645d 100644 --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -1328,14 +1328,12 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; ; AVX1-LABEL: gather_v8i32_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovmskps %ymm1, %eax +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm1 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: # implicit-def: $ymm0 ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -1359,16 +1357,14 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_16 ; AVX1-NEXT: .LBB4_15: # %cond.load19 -; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX1-NEXT: .LBB4_16: # %else20 -; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vmovmskps %ymm3, %eax +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vmovmskps %ymm2, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: # implicit-def: $ymm3 +; AVX1-NEXT: # implicit-def: $ymm2 ; AVX1-NEXT: jne .LBB4_17 ; 
AVX1-NEXT: # %bb.18: # %else26 ; AVX1-NEXT: testb $2, %al @@ -1392,16 +1388,14 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_32 ; AVX1-NEXT: .LBB4_31: # %cond.load58 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX1-NEXT: .LBB4_32: # %else61 -; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovmskps %ymm0, %eax +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpeqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovmskps %ymm1, %eax ; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: # implicit-def: $ymm0 +; AVX1-NEXT: # implicit-def: $ymm1 ; AVX1-NEXT: jne .LBB4_33 ; AVX1-NEXT: # %bb.34: # %else67 ; AVX1-NEXT: testb $2, %al @@ -1416,125 +1410,125 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_42 ; AVX1-NEXT: .LBB4_41: # %cond.load84 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-NEXT: .LBB4_42: # %else87 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_44 ; AVX1-NEXT: # %bb.43: # %cond.load89 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-NEXT: .LBB4_44: # %else92 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm2 ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_46 ; AVX1-NEXT: # %bb.45: # %cond.load94 ; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] ; AVX1-NEXT: .LBB4_46: # %else97 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_48 ; AVX1-NEXT: # %bb.47: # %cond.load99 ; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX1-NEXT: .LBB4_48: # %else102 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB4_1: # %cond.load -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_4 ; AVX1-NEXT: .LBB4_3: # %cond.load1 -; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB4_6 ; AVX1-NEXT: .LBB4_5: # %cond.load4 -; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vblendps 
{{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB4_8 ; AVX1-NEXT: .LBB4_7: # %cond.load7 -; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_10 ; AVX1-NEXT: .LBB4_9: # %cond.load10 -; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_12 ; AVX1-NEXT: .LBB4_11: # %cond.load13 -; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_14 ; AVX1-NEXT: .LBB4_13: # %cond.load16 -; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_15 ; AVX1-NEXT: jmp .LBB4_16 ; AVX1-NEXT: .LBB4_17: # %cond.load23 -; AVX1-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_20 ; AVX1-NEXT: .LBB4_19: # %cond.load28 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB4_22 ; AVX1-NEXT: .LBB4_21: # %cond.load33 
-; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB4_24 ; AVX1-NEXT: .LBB4_23: # %cond.load38 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm3, %xmm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_26 ; AVX1-NEXT: .LBB4_25: # %cond.load43 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_28 ; AVX1-NEXT: .LBB4_27: # %cond.load48 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_30 ; AVX1-NEXT: .LBB4_29: # %cond.load53 -; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_31 ; AVX1-NEXT: jmp .LBB4_32 ; AVX1-NEXT: .LBB4_33: # %cond.load64 -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_36 ; AVX1-NEXT: .LBB4_35: # %cond.load69 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; 
AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB4_38 ; AVX1-NEXT: .LBB4_37: # %cond.load74 -; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB4_40 ; AVX1-NEXT: .LBB4_39: # %cond.load79 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: jne .LBB4_41 ; AVX1-NEXT: jmp .LBB4_42 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 89459a2d10177..d99927ef85052 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -1442,11 +1442,9 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float> ; ; AVX1-LABEL: load_v8f32_v8i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 898b34e969b1d..be27475b65b79 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -1509,11 +1509,9 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) { ; 
; AVX1-LABEL: store_v8i32_v8i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6127,37 +6125,42 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; ; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX1-NEXT: vmovaps 64(%rsi), %ymm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5 -; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm8, %xmm8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-NEXT: vmaskmovps %ymm0, %ymm7, (%rdx) -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] +; AVX1-NEXT: vcvtdq2ps 32(%rdi), %ymm3 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcmpltps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm3, 
%xmm3 +; AVX1-NEXT: vcvtdq2ps (%rdi), %ymm5 +; AVX1-NEXT: vcmpltps %ymm4, %ymm5, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpacksswb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vcvtdq2ps 64(%rdi), %ymm5 +; AVX1-NEXT: vcmpltps %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2],xmm7[3] +; AVX1-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3] ; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx) -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vmaskmovps %ymm2, %ymm4, 64(%rdx) +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdx) +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vmaskmovps %ymm1, %ymm2, (%rdx) +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdx) ; AVX1-NEXT: vzeroupper ; 
AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index f4a0207dafde7..7d1687e373368 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -151,14 +151,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] @@ -372,13 +367,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -757,13 +749,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, 
%xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -2204,18 +2193,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB9_1 ; AVX1-NEXT: # %bb.2: # %else @@ -2867,18 +2855,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; 
AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB10_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3421,13 +3408,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3795,13 +3779,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps 
%ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 487f7298f442c..bc557f75f02a2 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -281,14 +281,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647] ; AVX1-NEXT: # xmm4 = mem[0,0] @@ -688,13 +683,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1231,13 
+1223,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3103,18 +3092,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB9_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3759,18 +3747,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: 
vpackssdw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB10_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4311,13 +4298,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4683,13 +4667,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, 
%xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 498f250f11c69..8215c52aabfbe 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -216,14 +216,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 @@ -554,13 +549,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; 
AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1027,13 +1019,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -2741,18 +2730,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB9_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3435,18 
+3423,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovmskb %xmm1, %eax -; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB10_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4011,13 +3998,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4403,13 +4387,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, 
%xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll index 98d193a79cb74..8754ae5716ae7 100644 --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1806,23 +1806,19 @@ define <16 x i32> @test_masked_v16i32(ptr %addr, <16 x i32> %old, <16 x i32> %ma ; ; AVX1-LABEL: test_masked_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcmpneq_oqps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vcmpneq_oqps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm4 ; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 
; AVX1-NEXT: retq ; ; AVX2-LABEL: test_masked_v16i32: diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll index 8843a0410a9f7..7ccdcd60b0d10 100644 --- a/llvm/test/CodeGen/X86/pr48215.ll +++ b/llvm/test/CodeGen/X86/pr48215.ll @@ -10,18 +10,17 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: cltd ; AVX1-NEXT: idivl %esi -; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,1,2,3] -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovmskps %ymm2, %ecx -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %ecx +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: addl %ecx, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index b2b242fa29818..790cdf07c218e 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -861,6 +861,7 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 @@ -1059,6 +1060,7 @@ define 
<16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 @@ -1074,6 +1076,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3 ; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll index 90e5c279d2e17..30995bd8c523b 100644 --- a/llvm/test/CodeGen/X86/setcc-lowering.ll +++ b/llvm/test/CodeGen/X86/setcc-lowering.ll @@ -10,10 +10,11 @@ define <8 x i16> @pr25080(<8 x i32> %a) { ; AVX1-LABEL: pr25080: ; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 64aead7041575..f3a19505c92db 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -898,23 +898,22 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-LABEL: v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: @@ -1111,41 +1110,39 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-LABEL: v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vcmpltps %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm8 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX1-NEXT: vcmpltps %ymm3, %ymm5, %ymm3 +; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vpsrad $31, %xmm7, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm5, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
v16i32: diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll index 67b7eb48e4cb3..d16202a98bb8c 100644 --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -180,21 +180,17 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp { define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { ; X86-LABEL: and_mask_constant: ; X86: ## %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: and_mask_constant: ; X64: ## %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index cee30f5fe5da9..233f2f4aec9f9 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -308,19 +308,18 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX1-LABEL: saddo_v6i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps 
%ymm1, %ymm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm4, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v6i32: @@ -376,19 +375,18 @@ define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX1-LABEL: saddo_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm4, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v8i32: @@ -452,41 +450,43 @@ define <16 x i32> @saddo_v16i32(<16 x i32> 
%a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX1-LABEL: saddo_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm6 +; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vcmpltps %ymm7, %ymm6, %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vxorps %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm8 +; AVX1-NEXT: vcmpltps %ymm7, %ymm8, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm0, 
%xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 64ed081048851..88e340d85735f 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -311,19 +311,18 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX1-LABEL: ssubo_v6i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) -; 
AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm4, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v6i32: @@ -380,19 +379,18 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX1-LABEL: ssubo_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm4, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v8i32: @@ -457,41 +455,43 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX1-LABEL: ssubo_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm6 +; AVX1-NEXT: vxorps %xmm7, 
%xmm7, %xmm7 +; AVX1-NEXT: vcmpltps %ymm6, %ymm7, %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vxorps %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm8 +; AVX1-NEXT: vcmpltps %ymm8, %ymm7, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) +; 
AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 6311678924d06..9e19a25522aed 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -517,19 +517,16 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vcmpneq_oqps %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: vmovq %xmm1, 16(%rdi) @@ -689,19 +686,16 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vcmpneq_oqps %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) @@ -937,16 +931,18 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm6 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7] -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpackssdw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vcmpneq_oqps %ymm7, %ymm6, %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 @@ -955,18 +951,18 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX1-NEXT: vpmuludq %xmm8, %xmm10, %xmm11 ; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7] -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm9 ; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq %xmm11, %xmm12, %xmm11 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm12 ; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5],xmm11[6,7] -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-NEXT: vcvtdq2ps %ymm9, %ymm9 +; AVX1-NEXT: vcmpneq_oqps %ymm7, %ymm9, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm9 ; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm7 ; AVX1-NEXT: vpacksswb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpmulld %xmm8, %xmm10, %xmm8 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 @@ -976,7 +972,6 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vpacksswb %xmm6, %xmm6, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 0adb9ddfc426a..b87a536e0fa25 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7745,19 +7745,15 @@ define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x float> %j, <16 x i3 ; ; AVX1-LABEL: vpaddd_mask_test: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5 +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vcmpneq_oqps %ymm6, %ymm5, %ymm5 +; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4 +; AVX1-NEXT: vcmpneq_oqps %ymm6, %ymm4, %ymm4 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm3 ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm2 -; AVX1-NEXT: vblendvps %ymm4, %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vblendvps %ymm5, %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm4, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm5, %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: vpaddd_mask_test: diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index 5b43acbe52375..1f2abc5073178 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -1048,11 +1048,9 @@ define <8 x i32> @is_positive_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { ; ; AVX1-LABEL: is_positive_mask_v8i32: ; 
AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1310,11 +1308,9 @@ define <8 x i32> @is_positive_mask_load_v8i32(<8 x i32> %x, ptr %p) { ; ; AVX1-LABEL: is_positive_mask_load_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1664,10 +1660,10 @@ define <8 x i1> @is_positive_mask_v8i32_v8i1(<8 x i32> %x, <8 x i1> %y) { ; ; AVX1-LABEL: is_positive_mask_v8i32_v8i1: ; AVX1: # %bb.0: +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper @@ -1961,14 +1957,14 @@ define <4 x i64> @PR52504(<4 x i16> %t3) { ; SSE42-LABEL: PR52504: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE42-NEXT: pmovsxwq %xmm1, %xmm2 -; SSE42-NEXT: pmovsxwq %xmm0, %xmm3 -; SSE42-NEXT: pxor 
%xmm1, %xmm1 -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE42-NEXT: por %xmm3, %xmm0 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 -; SSE42-NEXT: por %xmm2, %xmm1 +; SSE42-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE42-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE42-NEXT: movapd %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: PR52504: @@ -1976,20 +1972,17 @@ define <4 x i64> @PR52504(<4 x i16> %t3) { ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR52504: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR52504: diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index 05854ff728a07..fad3eb33400ea 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -3304,16 +3304,13 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v8i32: @@ -3372,14 +3369,13 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_2_v8i32: @@ -3432,36 +3428,36 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb 
%xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[2,2,2,2] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v8i32: @@ -3535,36 +3531,35 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v8i32: @@ -3638,36 +3633,36 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: 
vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v8i32: @@ -3741,36 +3736,35 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq 
{{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v8i32: @@ -3844,36 +3838,36 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; 
AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v8i32: @@ -3947,36 +3941,35 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_5_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = 
xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v8i32: @@ -4050,36 +4043,36 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_5_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor 
%xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd 
%xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_5_v8i32: @@ -4153,36 +4146,35 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_6_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] 
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_6_v8i32: @@ -4256,36 +4248,36 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_6_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: 
vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_6_v8i32: @@ -4359,36 +4351,35 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_7_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, 
%xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_7_v8i32: @@ -4462,36 +4453,36 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_7_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = 
xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_7_v8i32: @@ -4565,36 +4556,35 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_8_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: 
vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_8_v8i32: @@ -4668,36 +4658,36 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_8_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 
= xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_8_v8i32: @@ -4771,36 +4761,35 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_9_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: 
vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_9_v8i32: @@ -4874,36 +4863,36 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_9_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw 
%xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_9_v8i32: @@ -4977,36 +4966,35 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_10_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_10_v8i32: @@ -5080,36 +5068,36 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_10_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_10_v8i32: @@ -5183,36 +5171,35 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_11_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_11_v8i32: @@ -5286,36 +5273,36 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_11_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_11_v8i32: @@ -5389,36 +5376,35 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_12_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_12_v8i32: @@ -5492,36 +5478,36 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_12_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_12_v8i32: @@ -5595,36 +5581,35 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_13_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_13_v8i32: @@ -5698,36 +5683,36 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_13_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_13_v8i32: @@ -5801,36 +5786,35 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_14_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_14_v8i32: @@ -5904,36 +5888,36 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_14_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_14_v8i32: @@ -6007,36 +5991,35 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_15_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_15_v8i32: @@ -6110,36 +6093,36 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_15_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_15_v8i32: @@ -6213,36 +6196,35 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_16_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_16_v8i32: @@ -6316,36 +6298,36 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_16_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_16_v8i32: @@ -6419,36 +6401,35 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_17_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_17_v8i32: @@ -6522,36 +6503,36 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_17_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_17_v8i32: @@ -6625,36 +6606,35 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_18_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_18_v8i32: @@ -6728,36 +6708,36 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_18_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_18_v8i32: @@ -6831,36 +6811,35 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_19_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_19_v8i32: @@ -6934,36 +6913,36 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_19_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_19_v8i32: @@ -7037,36 +7016,35 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_20_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_20_v8i32: @@ -7140,36 +7118,36 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_20_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_20_v8i32: @@ -7243,36 +7221,35 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_21_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_21_v8i32: @@ -7346,36 +7323,36 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_21_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_21_v8i32: @@ -7449,36 +7426,35 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_22_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_22_v8i32: @@ -7552,36 +7528,36 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_22_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_22_v8i32: @@ -7655,36 +7631,35 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_23_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_23_v8i32: @@ -7758,36 +7733,36 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_23_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_23_v8i32: @@ -7861,36 +7836,35 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_24_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_24_v8i32: @@ -7964,36 +7938,36 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_24_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_24_v8i32: @@ -8067,36 +8041,35 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_25_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_25_v8i32: @@ -8170,36 +8143,36 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_25_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_25_v8i32: @@ -8273,36 +8246,35 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_26_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_26_v8i32: @@ -8376,36 +8348,36 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_26_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_26_v8i32: @@ -8479,36 +8451,35 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_27_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_27_v8i32: @@ -8582,36 +8553,36 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_27_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_27_v8i32: @@ -8685,36 +8656,35 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_28_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_28_v8i32: @@ -8788,36 +8758,36 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_28_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_28_v8i32: @@ -8891,36 +8861,35 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_29_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_29_v8i32: @@ -8994,36 +8963,36 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_29_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_29_v8i32: @@ -9097,36 +9066,35 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_30_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_30_v8i32: @@ -9200,36 +9168,36 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_30_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: 
vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_30_v8i32: @@ -9303,36 +9271,35 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_31_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_31_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index ec41657d2f248..47c8d0200b558 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX -; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL @@ -816,63 +816,122 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: orps %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16f32: -; AVX: # %bb.0: -; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2 -; AVX-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmaxps %ymm2, %ymm0, %ymm1 -; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2 -; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: js .LBB4_1 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: vmovaps %xmm0, %xmm2 -; AVX-NEXT: jmp .LBB4_3 -; AVX-NEXT: .LBB4_1: -; AVX-NEXT: vmovaps %xmm1, %xmm2 -; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: .LBB4_3: -; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 -; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmovd %xmm2, %eax -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: js .LBB4_4 -; AVX-NEXT: # %bb.5: -; AVX-NEXT: vmovaps %xmm2, %xmm3 -; AVX-NEXT: jmp .LBB4_6 -; AVX-NEXT: .LBB4_4: -; AVX-NEXT: vmovapd %xmm1, %xmm3 -; AVX-NEXT: 
vmovaps %xmm2, %xmm1 -; AVX-NEXT: .LBB4_6: -; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2 -; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 -; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovd %xmm1, %eax -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: js .LBB4_7 -; AVX-NEXT: # %bb.8: -; AVX-NEXT: vmovaps %xmm1, %xmm2 -; AVX-NEXT: jmp .LBB4_9 -; AVX-NEXT: .LBB4_7: -; AVX-NEXT: vmovaps %xmm0, %xmm2 -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: .LBB4_9: -; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm2 +; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm3 +; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmaxps %ymm3, %ymm0, %ymm1 +; AVX1-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2 +; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB4_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: jmp .LBB4_3 +; AVX1-NEXT: .LBB4_1: +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: .LBB4_3: +; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB4_4 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vmovaps %xmm2, %xmm3 +; AVX1-NEXT: jmp .LBB4_6 +; 
AVX1-NEXT: .LBB4_4: +; AVX1-NEXT: vmovapd %xmm1, %xmm3 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: .LBB4_6: +; AVX1-NEXT: vmaxss %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB4_7 +; AVX1-NEXT: # %bb.8: +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: jmp .LBB4_9 +; AVX1-NEXT: .LBB4_7: +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: .LBB4_9: +; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmaxps %ymm2, %ymm0, %ymm1 +; AVX2-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmaxps %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: js .LBB4_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: vmovaps %xmm0, %xmm2 +; AVX2-NEXT: jmp .LBB4_3 +; AVX2-NEXT: .LBB4_1: +; AVX2-NEXT: vmovaps %xmm1, %xmm2 +; AVX2-NEXT: vmovaps %xmm0, %xmm1 +; AVX2-NEXT: .LBB4_3: +; AVX2-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 +; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: js .LBB4_4 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: 
vmovaps %xmm2, %xmm3 +; AVX2-NEXT: jmp .LBB4_6 +; AVX2-NEXT: .LBB4_4: +; AVX2-NEXT: vmovapd %xmm1, %xmm3 +; AVX2-NEXT: vmovaps %xmm2, %xmm1 +; AVX2-NEXT: .LBB4_6: +; AVX2-NEXT: vmaxss %xmm3, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3 +; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: js .LBB4_7 +; AVX2-NEXT: # %bb.8: +; AVX2-NEXT: vmovaps %xmm1, %xmm2 +; AVX2-NEXT: jmp .LBB4_9 +; AVX2-NEXT: .LBB4_7: +; AVX2-NEXT: vmovaps %xmm0, %xmm2 +; AVX2-NEXT: vmovaps %xmm1, %xmm0 +; AVX2-NEXT: .LBB4_9: +; AVX2-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v16f32: ; AVX512BW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index f80544fdef7e6..1d272c8e44f58 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -950,12 +950,10 @@ define i1 @icmp0_v8i32_v8i1(<8 x i32>) nounwind { ; ; AVX1-LABEL: icmp0_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vtestps %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vtestps %ymm0, %ymm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1301,14 +1299,14 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; ; AVX1-LABEL: icmp0_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: 
vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 80b4f4614383f..87cbf55fee30a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -1329,11 +1329,9 @@ define i1 @icmp0_v8i32_v8i1(<8 x i32>) nounwind { ; ; AVX1-LABEL: icmp0_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: setnp %al @@ -1721,14 +1719,14 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; ; AVX1-LABEL: icmp0_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: 
vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 85c1e25c29ed5..dc9e69137a8a7 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2419,12 +2419,9 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_8i1_to_8i32: diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 9fd5b9010b0cf..0eb2221e333d2 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -679,15 +679,16 @@ define void @PR54171(ptr %mask0, ptr %mask1, i64 %i) { ; AVX1-NEXT: # %bb.1: # %if.then ; AVX1-NEXT: vmovd %edx, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, (%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) -; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; 
AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [0.0E+0,0.0E+0,1.0E+0,1.0E+0,2.0E+0,2.0E+0,3.0E+0,3.0E+0] +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,4.0E+0,5.0E+0,5.0E+0,6.0E+0,6.0E+0,7.0E+0,7.0E+0] +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) ; AVX1-NEXT: .LBB18_2: # %if.end +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR54171: