diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 586eb2f3cf45e..09c7476919e43 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23182,6 +23182,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { auto *IndexC = dyn_cast(EltNo); // Insert into out-of-bounds element is undefined. + // Code below relies on that we handle this special case early. if (IndexC && VT.isFixedLengthVector() && IndexC->getZExtValue() >= VT.getVectorNumElements()) return DAG.getUNDEF(VT); @@ -23192,14 +23193,28 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1)) return InVec; - if (!IndexC) { - // If this is variable insert to undef vector, it might be better to splat: - // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > - if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) - return DAG.getSplat(VT, DL, InVal); - return SDValue(); + // If this is variable insert to undef vector, it might be better to splat: + // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > + if (!IndexC && InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) + return DAG.getSplat(VT, DL, InVal); + + // Try to drop insert of UNDEF/POISON elements. This is also done in getNode, + // but we also do it as a DAG combine since for example simplifications into + // SPLAT_VECTOR/BUILD_VECTOR may turn poison elements into undef/zero etc, and + // then suddenly the InVec is guaranteed to not be poison. + if (InVal.isUndef()) { + if (IndexC && VT.isFixedLengthVector()) { + APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(), + IndexC->getZExtValue()); + if (DAG.isGuaranteedNotToBePoison(InVec, EltMask)) + return InVec; + } + return DAG.getFreeze(InVec); } + if (!IndexC) + return SDValue(); + if (VT.isScalableVector()) return SDValue(); @@ -27639,18 +27654,42 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N2 = N->getOperand(2); uint64_t InsIdx = N->getConstantOperandVal(2); - // If inserting an UNDEF, just return the original vector. - if (N1.isUndef()) - return N0; + // If inserting an UNDEF, just return the original vector (unless it makes the + // result more poisonous). + if (N1.isUndef()) { + if (N1.getOpcode() == ISD::POISON) + return N0; + if (VT.isFixedLengthVector()) { + unsigned SubVecNumElts = N1.getValueType().getVectorNumElements(); + APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx, + InsIdx + SubVecNumElts); + if (DAG.isGuaranteedNotToBePoison(N0, EltMask)) + return N0; + } + return DAG.getFreeze(N0); + } - // If this is an insert of an extracted vector into an undef vector, we can - // just use the input to the extract if the types match, and can simplify + // If this is an insert of an extracted vector into an undef/poison vector, we + // can just use the input to the extract if the types match, and can simplify // in some cases even if they don't. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(1) == N2) { + EVT N1VT = N1.getValueType(); EVT SrcVT = N1.getOperand(0).getValueType(); - if (SrcVT == VT) - return N1.getOperand(0); + if (SrcVT == VT) { + // Need to ensure that result isn't more poisonous if skipping both the + // extract+insert. 
+ if (N0.getOpcode() == ISD::POISON) + return N1.getOperand(0); + if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) { + unsigned SubVecNumElts = N1VT.getVectorNumElements(); + APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx, + InsIdx + SubVecNumElts); + if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask)) + return N1.getOperand(0); + } else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0))) + return N1.getOperand(0); + } // TODO: To remove the zero check, need to adjust the offset to // a multiple of the new src type. if (isNullConstant(N2)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6df21b624137f..3b4802d4b47b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7952,23 +7952,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except // for scalable vectors where we will generate appropriate code to // deal with out-of-bounds cases correctly. - if (N3C && N1.getValueType().isFixedLengthVector() && - N3C->getZExtValue() >= N1.getValueType().getVectorNumElements()) + if (N3C && VT.isFixedLengthVector() && + N3C->getZExtValue() >= VT.getVectorNumElements()) return getUNDEF(VT); // Undefined index can be assumed out-of-bounds, so that's UNDEF too. if (N3.isUndef()) return getUNDEF(VT); - // If the inserted element is an UNDEF, just use the input vector. - if (N2.isUndef()) + // If inserting poison, just use the input vector. + if (N2.getOpcode() == ISD::POISON) return N1; + // Inserting undef into undef/poison is still undef. + if (N2.getOpcode() == ISD::UNDEF && N1.isUndef()) + return getUNDEF(VT); + + // If the inserted element is an UNDEF, just use the input vector. + // But not if skipping the insert could make the result more poisonous. + if (N2.isUndef()) { + if (N3C && VT.isFixedLengthVector()) { + APInt EltMask = + APInt::getOneBitSet(VT.getVectorNumElements(), N3C->getZExtValue()); + if (isGuaranteedNotToBePoison(N1, EltMask)) + return N1; + } else if (isGuaranteedNotToBePoison(N1)) + return N1; + } break; } case ISD::INSERT_SUBVECTOR: { - // Inserting undef into undef is still undef. - if (N1.isUndef() && N2.isUndef()) + // If inserting poison, just use the input vector, + if (N2.getOpcode() == ISD::POISON) + return N1; + + // Inserting undef into undef/poison is still undef. + if (N2.getOpcode() == ISD::UNDEF && N1.isUndef()) return getUNDEF(VT); EVT N2VT = N2.getValueType(); @@ -7997,11 +8016,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == N2VT) return N2; - // If this is an insert of an extracted vector into an undef vector, we - // can just use the input to the extract. + // If this is an insert of an extracted vector into an undef/poison vector, + // we can just use the input to the extract. But not if skipping the + // extract+insert could make the result more poisonous. 
if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR && - N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) - return N2.getOperand(0); + N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) { + if (N1.getOpcode() == ISD::POISON) + return N2.getOperand(0); + if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) { + unsigned LoBit = N3->getAsZExtVal(); + unsigned HiBit = LoBit + N2VT.getVectorNumElements(); + APInt EltMask = + APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit); + if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask)) + return N2.getOperand(0); + } else if (isGuaranteedNotToBePoison(N2.getOperand(0))) + return N2.getOperand(0); + } + + // If the inserted subvector is UNDEF, just use the input vector. + // But not if skipping the insert could make the result more poisonous. + if (N2.isUndef()) { + if (VT.isFixedLengthVector()) { + unsigned LoBit = N3->getAsZExtVal(); + unsigned HiBit = LoBit + N2VT.getVectorNumElements(); + APInt EltMask = + APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit); + if (isGuaranteedNotToBePoison(N1, EltMask)) + return N1; + } else if (isGuaranteedNotToBePoison(N1)) + return N1; + } break; } case ISD::BITCAST: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 000f8cc6786a5..4775b2501f8a5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3433,8 +3433,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; } case ISD::INSERT_SUBVECTOR: { - // Demand any elements from the subvector and the remainder from the src its - // inserted into. + // Demand any elements from the subvector and the remainder from the src it + // is inserted into. SDValue Src = Op.getOperand(0); SDValue Sub = Op.getOperand(1); uint64_t Idx = Op.getConstantOperandVal(2); @@ -3443,6 +3443,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt DemandedSrcElts = DemandedElts; DemandedSrcElts.clearBits(Idx, Idx + NumSubElts); + // If none of the sub operand elements are demanded, bypass the insert. 
+ if (!DemandedSubElts) + return TLO.CombineTo(Op, Src); + APInt SubUndef, SubZero; if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll index 82802c79c7085..c6fff3e3d3181 100644 --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -57,8 +57,8 @@ define void @widen_f16_build_vector(ptr %addr) { ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #13294 // =0x33ee -; CHECK-NEXT: movk w8, #13294, lsl #16 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: dup v0.4h, w8 +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret store <2 x half> , ptr %addr, align 2 ret void diff --git a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll index 34899cb47dba3..545da98034527 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll @@ -94,16 +94,14 @@ define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unname ; CHECK-LABEL: combine_undef_add_8xi32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: mov v1.s[1], w1 -; CHECK-NEXT: uhadd v0.4h, v0.4h, v0.4h ; CHECK-NEXT: mov v1.s[2], w2 ; CHECK-NEXT: mov v1.s[3], w3 -; CHECK-NEXT: xtn v2.4h, v1.4s -; CHECK-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-NEXT: uhadd v1.4h, v2.4h, v1.4h -; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: uaddlv s0, v1.8h +; CHECK-NEXT: uzp2 v2.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %a1 = insertelement <8 x i32> poison, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll index 9efe0b33910c8..2905d707bdd09 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -37,6 +37,10 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -59,8 +63,15 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -71,6 +82,10 @@ 
define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -89,6 +104,10 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -107,6 +126,10 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -150,6 +173,10 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -172,8 +199,15 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -184,6 +218,10 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -202,6 +240,10 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -220,6 +262,10 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -264,6 +310,10 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -286,8 +336,15 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -298,6 +355,10 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -316,6 +377,10 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -334,6 +399,10 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll index 9cebbc4aab9b7..0e95da31c13cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -36,6 +36,10 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -58,8 +62,15 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b -; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b -; VBITS_GE_256-NEXT: sel z1.b, p2, z2.b, z3.b +; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b +; VBITS_GE_256-NEXT: mov z5.b, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1 +; VBITS_GE_256-NEXT: and z5.b, z5.b, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z4.b, #0 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z5.b, #0 +; VBITS_GE_256-NEXT: sel z0.b, p2, z0.b, z1.b +; VBITS_GE_256-NEXT: sel z1.b, p1, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -70,6 +81,10 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; VBITS_GE_512-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.b +; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -88,6 +103,10 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -106,6 +125,10 @@ define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -149,6 +172,10 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -171,8 +198,15 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -183,6 +217,10 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -201,6 +239,10 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -219,6 +261,10 @@ define void @select_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -262,6 +308,10 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -284,8 +334,15 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; 
VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -296,6 +353,10 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -314,6 +375,10 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -332,6 +397,10 @@ define void @select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -376,6 +445,10 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -398,8 +471,15 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -410,6 +490,10 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -428,6 +512,10 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; 
CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -446,6 +534,10 @@ define void @select_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 093e6cd9328c8..ebd32c73ec65b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1198,11 +1198,15 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] ; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ec0693a541e44..8b845dff64ffe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -199,6 +199,13 @@ define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -434,6 +441,13 @@ define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -558,6 +572,13 @@ define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 39701131d7db6..12b7886d76c70 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -293,6 +293,13 @@ define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b +; CHECK-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z4.b, z4.b, #0x1 +; CHECK-NEXT: and z5.b, z5.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p0/z, z4.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z5.b, #0 ; CHECK-NEXT: mov z0.b, p1/m, z1.b ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -697,6 +704,13 @@ define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -911,6 +925,13 @@ define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -1044,6 +1065,13 @@ define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index aba9056c78cda..5aa3a246d7616 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d 
-verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN ; Check that the default value enables the web folding and @@ -8,20 +8,35 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) { -; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 -; NO_FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 -; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 -; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 -; NO_FOLDING-NEXT: vse32.v v10, (a0) -; NO_FOLDING-NEXT: vse32.v v11, (a1) -; NO_FOLDING-NEXT: vse32.v v8, (a2) -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING1-NEXT: vse32.v v10, (a0) +; NO_FOLDING1-NEXT: vse32.v v11, (a1) +; NO_FOLDING1-NEXT: vse32.v v8, (a2) +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10 +; NO_FOLDING2-NEXT: vse32.v v9, (a0) +; NO_FOLDING2-NEXT: vse32.v v11, (a1) +; NO_FOLDING2-NEXT: vse32.v v8, (a2) +; NO_FOLDING2-NEXT: ret ; ; ZVFH-LABEL: vfwmul_v2f116_multiple_users: ; ZVFH: # %bb.0: @@ -61,20 +76,35 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, } define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) { -; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 -; NO_FOLDING-NEXT: vsetvli 
zero, zero, e64, m1, ta, ma -; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 -; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 -; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 -; NO_FOLDING-NEXT: vse64.v v10, (a0) -; NO_FOLDING-NEXT: vse64.v v11, (a1) -; NO_FOLDING-NEXT: vse64.v v8, (a2) -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING1-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING1-NEXT: vse64.v v10, (a0) +; NO_FOLDING1-NEXT: vse64.v v11, (a1) +; NO_FOLDING1-NEXT: vse64.v v8, (a2) +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10 +; NO_FOLDING2-NEXT: vse64.v v9, (a0) +; NO_FOLDING2-NEXT: vse64.v v11, (a1) +; NO_FOLDING2-NEXT: vse64.v v8, (a2) +; NO_FOLDING2-NEXT: ret ; ; FOLDING-LABEL: vfwmul_v2f32_multiple_users: ; FOLDING: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll index 227a428831b60..b093e9e35edad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING ; Check that the default value enables the web folding and @@ -16,21 +16,38 @@ ; We need the web size to be at least 3 for the folding 
to happen, because ; %c has 3 uses. define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) { -; NO_FOLDING-LABEL: vwmul_v2i16_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; NO_FOLDING-NEXT: vle8.v v8, (a0) -; NO_FOLDING-NEXT: vle8.v v9, (a1) -; NO_FOLDING-NEXT: vle8.v v10, (a2) -; NO_FOLDING-NEXT: vsext.vf2 v11, v8 -; NO_FOLDING-NEXT: vsext.vf2 v8, v9 -; NO_FOLDING-NEXT: vsext.vf2 v9, v10 -; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 -; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 -; NO_FOLDING-NEXT: vor.vv v8, v8, v10 -; NO_FOLDING-NEXT: vor.vv v8, v8, v9 -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vwmul_v2i16_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING1-NEXT: vle8.v v8, (a0) +; NO_FOLDING1-NEXT: vle8.v v9, (a1) +; NO_FOLDING1-NEXT: vle8.v v10, (a2) +; NO_FOLDING1-NEXT: vsext.vf2 v11, v8 +; NO_FOLDING1-NEXT: vsext.vf2 v8, v9 +; NO_FOLDING1-NEXT: vsext.vf2 v9, v10 +; NO_FOLDING1-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING1-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING1-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING1-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING1-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vwmul_v2i16_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vle8.v v8, (a0) +; NO_FOLDING2-NEXT: vle8.v v9, (a1) +; NO_FOLDING2-NEXT: vle8.v v10, (a2) +; NO_FOLDING2-NEXT: vsext.vf2 v11, v8 +; NO_FOLDING2-NEXT: vsext.vf2 v8, v9 +; NO_FOLDING2-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; NO_FOLDING2-NEXT: vwadd.wv v9, v11, v10 +; NO_FOLDING2-NEXT: vwsub.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING2-NEXT: vor.vv v8, v8, v11 +; NO_FOLDING2-NEXT: ret ; ; FOLDING-LABEL: vwmul_v2i16_multiple_users: ; FOLDING: # %bb.0: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 4dd9173e2d418..38e42c137e3a9 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -663,8 +663,8 @@ define void @vld3_v2i8(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrd r0, r2, [r0] -; CHECK-NEXT: strd r0, r2, [sp] +; CHECK-NEXT: ldrd r2, r0, [r0] +; CHECK-NEXT: strd r2, r0, [sp] ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vmov.u16 r0, q0[4] diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 595f8491b405c..1df4e9f47f21b 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -262,37 +262,54 @@ define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_4f32_f32_012u: ; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_012u: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_012u: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X86-SSE1-NEXT: retl ; ; X86-SSE41-LABEL: merge_4f32_f32_012u: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; X86-SSE41-NEXT: retl %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 2 @@ -309,37 +326,54 @@ define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_4f32_f32_019u: ; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_019u: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_019u: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X86-SSE1-NEXT: retl ; ; X86-SSE41-LABEL: merge_4f32_f32_019u: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; X86-SSE41-NEXT: retl %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 9 diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll index d8a010bacc683..10b7ad285fa7b 100644 --- a/llvm/test/CodeGen/X86/mmx-build-vector.ll +++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll @@ -2,11 +2,11 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx | FileCheck %s --check-prefixes=X86,X86-MMX ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s --check-prefixes=X86,X86-SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X64,X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s 
--check-prefixes=X64,X64-SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) @@ -290,15 +290,21 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { ; X86-LABEL: build_v4i16_012u: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 -; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: movd %esi, %mm1 +; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X86-NEXT: movd %edx, %mm0 +; X86-NEXT: movd %ecx, %mm2 +; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] ; X86-NEXT: paddd %mm2, %mm2 ; X86-NEXT: movq %mm2, (%eax) +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: build_v4i16_012u: @@ -475,45 +481,107 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret void } + +; Recursion depth limit in isGuaranteedNotToBeUndefOrPoison prevents llc from +; detecting that we insert an "undef" element in a position that already is +; undef. OTOH, opt would optimize away that insertelement operation from the +; IR, so maybe that isn't a problem in reality. 
define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind { -; X86-LABEL: build_v8i8_0123zzzu: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 -; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3] -; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: pxor %mm1, %mm1 -; X86-NEXT: punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3] -; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3] -; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: paddd %mm2, %mm2 -; X86-NEXT: movq %mm2, (%eax) -; X86-NEXT: retl +; X86-MMX-LABEL: build_v8i8_0123zzzu: +; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %ebp +; X86-MMX-NEXT: movl %esp, %ebp +; X86-MMX-NEXT: pushl %esi +; X86-MMX-NEXT: andl $-8, %esp +; X86-MMX-NEXT: subl $16, %esp +; X86-MMX-NEXT: movl 8(%ebp), %eax +; X86-MMX-NEXT: movzbl 20(%ebp), %edx +; X86-MMX-NEXT: movzbl 24(%ebp), %ecx +; X86-MMX-NEXT: shll $8, %ecx +; X86-MMX-NEXT: orl %edx, %ecx +; X86-MMX-NEXT: shll $16, %ecx +; X86-MMX-NEXT: movzbl 12(%ebp), %edx +; X86-MMX-NEXT: movzbl 16(%ebp), %esi +; X86-MMX-NEXT: shll $8, %esi +; X86-MMX-NEXT: orl %edx, %esi +; X86-MMX-NEXT: movzwl %si, %edx +; X86-MMX-NEXT: orl %ecx, %edx +; X86-MMX-NEXT: movzbl %al, %ecx +; X86-MMX-NEXT: shll $24, %ecx +; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-MMX-NEXT: movl %edx, (%esp) +; X86-MMX-NEXT: movq (%esp), %mm0 +; X86-MMX-NEXT: paddd %mm0, %mm0 +; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: leal -4(%ebp), %esp +; X86-MMX-NEXT: popl %esi +; X86-MMX-NEXT: popl %ebp +; X86-MMX-NEXT: retl ; -; X64-LABEL: build_v8i8_0123zzzu: -; X64: # %bb.0: -; X64-NEXT: movd %r8d, %mm0 -; X64-NEXT: movd %ecx, %mm1 -; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] -; X64-NEXT: movd %edx, %mm0 -; X64-NEXT: movd %esi, %mm2 -; X64-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3] -; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X64-NEXT: pxor %mm0, %mm0 -; X64-NEXT: pxor %mm1, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3] -; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3] -; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X64-NEXT: paddd %mm2, %mm2 -; X64-NEXT: movq %mm2, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0123zzzu: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: shll $8, %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $16, %ecx +; X86-SSE-NEXT: orl %edx, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: shll $24, %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE2-LABEL: build_v8i8_0123zzzu: +; X64-SSE2: # 
+; X64-SSE2-NEXT: movzbl %sil, %eax
+; X64-SSE2-NEXT: movzbl %dl, %edx
+; X64-SSE2-NEXT: shll $8, %edx
+; X64-SSE2-NEXT: orl %eax, %edx
+; X64-SSE2-NEXT: movzbl %cl, %eax
+; X64-SSE2-NEXT: shll $16, %eax
+; X64-SSE2-NEXT: orl %edx, %eax
+; X64-SSE2-NEXT: shll $24, %r8d
+; X64-SSE2-NEXT: orl %eax, %r8d
+; X64-SSE2-NEXT: movd %r8d, %xmm0
+; X64-SSE2-NEXT: movdq2q %xmm0, %mm0
+; X64-SSE2-NEXT: paddd %mm0, %mm0
+; X64-SSE2-NEXT: movq %mm0, (%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSSE3-LABEL: build_v8i8_0123zzzu:
+; X64-SSSE3: # %bb.0:
+; X64-SSSE3-NEXT: movzbl %sil, %eax
+; X64-SSSE3-NEXT: movzbl %dl, %edx
+; X64-SSSE3-NEXT: shll $8, %edx
+; X64-SSSE3-NEXT: orl %eax, %edx
+; X64-SSSE3-NEXT: movzbl %cl, %eax
+; X64-SSSE3-NEXT: shll $16, %eax
+; X64-SSSE3-NEXT: orl %edx, %eax
+; X64-SSSE3-NEXT: shll $24, %r8d
+; X64-SSSE3-NEXT: orl %eax, %r8d
+; X64-SSSE3-NEXT: movd %r8d, %xmm0
+; X64-SSSE3-NEXT: movdq2q %xmm0, %mm0
+; X64-SSSE3-NEXT: paddd %mm0, %mm0
+; X64-SSSE3-NEXT: movq %mm0, (%rdi)
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: build_v8i8_0123zzzu:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movzbl %sil, %eax
+; X64-AVX-NEXT: vmovd %eax, %xmm0
+; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
+; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; X64-AVX-NEXT: movdq2q %xmm0, %mm0
+; X64-AVX-NEXT: paddd %mm0, %mm0
+; X64-AVX-NEXT: movq %mm0, (%rdi)
+; X64-AVX-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
   %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -558,22 +626,61 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 }
 define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-LABEL: build_v8i8_0zzzzzzu:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movd %eax, %mm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%eax)
-; X86-NEXT: retl
+; X86-MMX-LABEL: build_v8i8_0zzzzzzu:
+; X86-MMX: # %bb.0:
+; X86-MMX-NEXT: pushl %ebp
+; X86-MMX-NEXT: movl %esp, %ebp
+; X86-MMX-NEXT: andl $-8, %esp
+; X86-MMX-NEXT: subl $8, %esp
+; X86-MMX-NEXT: movl 8(%ebp), %eax
+; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
+; X86-MMX-NEXT: movl %ecx, (%esp)
+; X86-MMX-NEXT: movzbl %al, %ecx
+; X86-MMX-NEXT: shll $24, %ecx
+; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: paddd %mm0, %mm0
+; X86-MMX-NEXT: movq %mm0, (%eax)
+; X86-MMX-NEXT: movl %ebp, %esp
+; X86-MMX-NEXT: popl %ebp
+; X86-MMX-NEXT: retl
 ;
-; X64-LABEL: build_v8i8_0zzzzzzu:
-; X64: # %bb.0:
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movd %eax, %mm0
-; X64-NEXT: paddd %mm0, %mm0
-; X64-NEXT: movq %mm0, (%rdi)
-; X64-NEXT: retq
+; X86-SSE-LABEL: build_v8i8_0zzzzzzu:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: paddd %mm0, %mm0
+; X86-SSE-NEXT: movq %mm0, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X64-SSE2-LABEL: build_v8i8_0zzzzzzu:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movzbl %sil, %eax
+; X64-SSE2-NEXT: movd %eax, %xmm0
+; X64-SSE2-NEXT: movdq2q %xmm0, %mm0
+; X64-SSE2-NEXT: paddd %mm0, %mm0
+; X64-SSE2-NEXT: movq %mm0, (%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSSE3-LABEL: build_v8i8_0zzzzzzu:
+; X64-SSSE3: # %bb.0:
+; X64-SSSE3-NEXT: movzbl %sil, %eax
+; X64-SSSE3-NEXT: movd %eax, %xmm0
+; X64-SSSE3-NEXT: movdq2q %xmm0, %mm0
+; X64-SSSE3-NEXT: paddd %mm0, %mm0
+; X64-SSSE3-NEXT: movq %mm0, (%rdi)
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: build_v8i8_0zzzzzzu:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movzbl %sil, %eax
+; X64-AVX-NEXT: vmovd %eax, %xmm0
+; X64-AVX-NEXT: movdq2q %xmm0, %mm0
+; X64-AVX-NEXT: paddd %mm0, %mm0
+; X64-AVX-NEXT: movq %mm0, (%rdi)
+; X64-AVX-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 0, i32 1
   %3 = insertelement <8 x i8> %2, i8 0, i32 2
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index efa6c16fbf4eb..9a1ce62a45834 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3051,16 +3051,21 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: psraw $8, %xmm1
 ; SSE2-NEXT: pextrw $7, %xmm1, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movsbl (%rsi), %eax
 ; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65531,65531,65531,65531]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movsbl (%rdx), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movsbl (%rdx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movsbl (%rsi), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
@@ -3069,21 +3074,26 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: movsbl (%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; SSSE3-NEXT: movsbl (%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: movd %eax, %xmm2
 ; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movsbl (%rsi), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: palignr {{.*#+}} xmm3 = xmm1[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65531,65531,65531,65531]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: shuffle_scalar_to_vector_extract:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
 ; SSE41-NEXT: pextrw $4, %xmm0, %eax
 ; SSE41-NEXT: pextrw $7, %xmm0, %ecx
 ; SSE41-NEXT: pxor %xmm0, %xmm0
@@ -3099,7 +3109,8 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ;
 ; AVX-LABEL: shuffle_scalar_to_vector_extract:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX-NEXT: vpextrw $4, %xmm0, %eax
 ; AVX-NEXT: vpextrw $7, %xmm0, %ecx
 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -3572,68 +3583,78 @@ define void @SpinningCube() {
 ; SSE2-LABEL: SpinningCube:
 ; SSE2: # %bb.0: # %entry
 ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT: xorps %xmm3, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE2-NEXT: addps %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, (%rax)
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, (%rax)
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
+; SSE2-NEXT: addps %xmm0, %xmm3
+; SSE2-NEXT: movaps %xmm3, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: addps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: SpinningCube:
 ; SSSE3: # %bb.0: # %entry
 ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSSE3-NEXT: xorps %xmm3, %xmm3
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSSE3-NEXT: addps %xmm3, %xmm1
-; SSSE3-NEXT: movaps %xmm1, (%rax)
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
-; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT: addps %xmm0, %xmm1
-; SSSE3-NEXT: movaps %xmm1, (%rax)
+; SSSE3-NEXT: xorps %xmm0, %xmm0
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
+; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
+; SSSE3-NEXT: addps %xmm0, %xmm3
+; SSSE3-NEXT: movaps %xmm3, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: addps %xmm2, %xmm0
+; SSSE3-NEXT: movaps %xmm0, (%rax)
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: SpinningCube:
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
-; SSE41-NEXT: addps %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, (%rax)
-; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
-; SSE41-NEXT: mulps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT: movaps {{.*#+}} xmm2 = [0.0E+0,0.0E+0,-2.0E+0,u]
+; SSE41-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT: movaps %xmm2, %xmm4
+; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[0]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; SSE41-NEXT: addps %xmm4, %xmm0
+; SSE41-NEXT: movaps %xmm0, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; SSE41-NEXT: mulps %xmm2, %xmm0
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm0, (%rax)
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: SpinningCube:
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
 ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],zero,xmm4[3]
 ; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
 ; AVX-NEXT: vmovaps %xmm2, (%rax)
 ; AVX-NEXT: vbroadcastss (%rax), %xmm2
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 572ed314ab31d..6b713c7a43e51 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -3273,10 +3273,10 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ;
 ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vmovdqa 48(%rsi), %xmm1
+; AVX-NEXT: vpaddb 48(%rdi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0