diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3ce45e0e43bf4..804823af8daf0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { SDValue DAGCombiner::visitCTPOP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + unsigned NumBits = VT.getScalarSizeInBits(); SDLoc DL(N); // fold (ctpop c1) -> c2 if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0})) return C; + + // If the upper half of the bits is known to be zero, check whether it is + // profitable to count only the lower half with a half-width CTPOP. + if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) { + EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2); + if (hasOperation(ISD::CTPOP, HalfVT) && + TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) && + TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT)) { + APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2); + if (DAG.MaskedValueIsZero(N0, UpperBits)) { + SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT, + DAG.getZExtOrTrunc(N0, DL, HalfVT)); + return DAG.getZExtOrTrunc(PopCnt, DL, VT); + } + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 1346678e51e3d..3b9c3e3ba1752 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b32 s4, s8, 1 -; SI-NEXT: s_mov_b32 s5, 0 -; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; SI-NEXT: s_add_i32 s4, s6, s4 +; SI-NEXT: s_bcnt1_i32_b32 s4, s4 +; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] 
+; SI-NEXT: s_add_i32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_and_b32 s4, s8, 0xff ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_and_b32 s4, s8, 1 -; VI-NEXT: s_mov_b32 s5, 0 -; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; VI-NEXT: s_add_i32 s4, s6, s4 +; VI-NEXT: s_bcnt1_i32_b32 s4, s4 +; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; VI-NEXT: s_add_i32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index abbcf22f77e43..e0a96a9f98879 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X64-POPCOUNT-LABEL: ctpop_mask2: ; X64-POPCOUNT: # %bb.0: ; X64-POPCOUNT-NEXT: andl $3, %edi -; X64-POPCOUNT-NEXT: popcntq %rdi, %rax +; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; ; X86-NO-POPCOUNT-LABEL: ctpop_mask2: @@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-POPCOUNT-LABEL: ctpop_mask4: ; X64-POPCOUNT: # %bb.0: ; X64-POPCOUNT-NEXT: andl $15, %edi -; X64-POPCOUNT-NEXT: popcntq %rdi, %rax +; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; ; X86-NO-POPCOUNT-LABEL: ctpop_mask4: @@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone { ; X64-POPCOUNT-LABEL: ctpop_mask5: ; X64-POPCOUNT: # %bb.0: ; X64-POPCOUNT-NEXT: andl $31, %edi -; X64-POPCOUNT-NEXT: popcntq %rdi, %rax +; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; ; X86-NO-POPCOUNT-LABEL: ctpop_mask5: @@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind 
readnone { ; X64-POPCOUNT-LABEL: ctpop_shifted_mask6: ; X64-POPCOUNT: # %bb.0: ; X64-POPCOUNT-NEXT: andl $26112, %edi # imm = 0x6600 -; X64-POPCOUNT-NEXT: popcntq %rdi, %rax +; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6: @@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone { ; X64-POPCOUNT-LABEL: ctpop_shifted_mask8: ; X64-POPCOUNT: # %bb.0: ; X64-POPCOUNT-NEXT: andl $65280, %edi # imm = 0xFF00 -; X64-POPCOUNT-NEXT: popcntq %rdi, %rax +; X64-POPCOUNT-NEXT: popcntl %edi, %eax ; X64-POPCOUNT-NEXT: retq ; ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8: