diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 429e2b42ab5ca..b69674d9be4e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6128,10 +6128,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
     unsigned NumSubElts = SubVT.getVectorNumElements();
+    uint64_t InsertIdx = N.getConstantOperandVal(2);
+    // Handle CONCAT(SUB0, SUB1).
+    // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
+    if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
+        NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        Src.getOperand(0).isUndef() &&
+        Src.getOperand(1).getValueType() == SubVT &&
+        Src.getConstantOperandVal(2) == 0 &&
+        SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
+      for (int i = 0; i != (int)NumSubElts; ++i)
+        Mask.push_back(i);
+      for (int i = 0; i != (int)NumSubElts; ++i)
+        Mask.push_back(i + NumElts);
+      Ops.push_back(Src.getOperand(1));
+      Ops.push_back(Sub);
+      return true;
+    }
     if (!N->isOnlyUserOf(Sub.getNode()))
       return false;
     SDValue SubBC = peekThroughBitcasts(Sub);
-    uint64_t InsertIdx = N.getConstantOperandVal(2);
     // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
     if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
         SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
@@ -6154,21 +6170,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       Ops.push_back(SubBCSrc);
       return true;
     }
-    // Handle CONCAT(SUB0, SUB1).
-    // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
-    if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
-        NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
-        Src.getOperand(0).isUndef() &&
-        Src.getOperand(1).getValueType() == SubVT &&
-        Src.getConstantOperandVal(2) == 0) {
-      for (int i = 0; i != (int)NumSubElts; ++i)
-        Mask.push_back(i);
-      for (int i = 0; i != (int)NumSubElts; ++i)
-        Mask.push_back(i + NumElts);
-      Ops.push_back(Src.getOperand(1));
-      Ops.push_back(Sub);
-      return true;
-    }
     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
     SmallVector<int, 64> SubMask;
     SmallVector<SDValue, 2> SubInputs;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 758061d456807..29d80e16bb26e 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -6897,9 +6897,10 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,4,21,22,23,0,25,26,27,4,29,30,31]
+; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -7098,9 +7099,10 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
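
Note on the functional change: the CONCAT(SUB0, SUB1) case is hoisted above the "if (!N->isOnlyUserOf(Sub.getNode())) return false;" early-out, and its guard is relaxed from requiring N to be Sub's sole user to SDNode::areOnlyUsersOf({N, Src}, Sub), i.e. Sub may now also be consumed by the inner INSERT_SUBVECTOR Src. That is exactly the shape of a splat-style concat(x, x), where x feeds both inserts and the old single-user check would reject the fold. A minimal standalone sketch of the guard's semantics, using toy stand-ins (Node, Users, and areOnlyUsersOf here model SDNode's use-list and are not LLVM's types):

#include <algorithm>
#include <vector>

// Toy stand-in for an SDNode and its use-list.
struct Node {
  std::vector<const Node *> Users; // nodes that consume this node's value
};

// Mirrors the shape of SDNode::areOnlyUsersOf: true iff every user of Sub
// is one of the Allowed nodes.
static bool areOnlyUsersOf(const std::vector<const Node *> &Allowed,
                           const Node *Sub) {
  return std::all_of(Sub->Users.begin(), Sub->Users.end(),
                     [&](const Node *U) {
                       return std::find(Allowed.begin(), Allowed.end(), U) !=
                              Allowed.end();
                     });
}

int main() {
  // concat(x, x) as built from two inserts:
  //   Src = insert_subvector(undef, x, 0)
  //   N   = insert_subvector(Src, x, NumSubElts)
  Node X, Src, N;
  X.Users = {&Src, &N}; // x feeds both inserts

  // The old guard (N must be x's only user) fails; the relaxed guard
  // accepts it because both users belong to the concat pattern.
  bool OldGuard = areOnlyUsersOf({&N}, &X);       // false
  bool NewGuard = areOnlyUsersOf({&N, &Src}, &X); // true
  return (!OldGuard && NewGuard) ? 0 : 1;
}

As the updated AVX512BW checks show, recognizing the concat lets the shuffle combiner replace the vinserti64x4 + vpandd/vpandq masking sequence with a single vpermi2d/vpermi2q permute against a zeroed register.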