From 461686633c8ef53da99b1fb4da700e5fbb5e91cc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 4 Nov 2025 14:17:55 +0000
Subject: [PATCH 1/2] [X86] narrowBitOpRMW - allow additional uses of the
 BTC/R/S result

If there are additional uses of the bit-twiddled value as well as the
RMW store, we can replace them with a (re)loaded copy of the full-width
integer value after the store.

There's some memory op chain handling needed here - the additional
(re)load is chained after the new store, and then any dependencies of
the original store are chained after the (re)load.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  19 +-
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 401 ++++---------
 2 files changed, 89 insertions(+), 331 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..4da1bb0c81db1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53369,8 +53369,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
   //
   // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
   SDValue SrcVal, InsertBit, ShAmt;
-  if (!StoredVal.hasOneUse() ||
-      !(sd_match(StoredVal, m_And(m_Value(SrcVal),
+  if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
                                   m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
        sd_match(StoredVal, m_Or(m_Value(SrcVal),
                                 m_Shl(m_One(), m_Value(ShAmt)))) ||
@@ -53441,8 +53440,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
     Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
   }
 
-  return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
-                      Align(), St->getMemOperand()->getFlags());
+  SDValue NewStore =
+      DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+                   Align(), St->getMemOperand()->getFlags());
+
+  // If there are other uses of StoredVal, replace with a new load of the
+  // whole (updated) value and ensure that any chained dependencies on the
+  // original store are updated to come AFTER the new load.
+ if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + DAG.ReplaceAllUsesWith(SDValue(St, 0), NewLoad.getValue(1)); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index bcb14fd25b975..32d225273a6e1 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -906,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVX2-LABEL: 
complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1088,247 +1019,63 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind ; X86-LABEL: chain_reset_i256: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $112, %esp -; X86-NEXT: movzbl 20(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $28, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 72(%esp,%eax), %edx -; X86-NEXT: movl 76(%esp,%eax), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %edx -; X86-NEXT: movl 84(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl 64(%esp,%eax), %edi -; X86-NEXT: movl 88(%esp,%eax), %esi -; X86-NEXT: movl 92(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: notl %eax -; X86-NEXT: notl %edx -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: notl %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 8(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 20(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl 16(%ecx), %edx -; X86-NEXT: andl 28(%ecx), %eax -; X86-NEXT: andl 24(%ecx), %ebx -; X86-NEXT: andl 4(%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl (%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, 24(%ecx) -; X86-NEXT: movl %eax, 28(%ecx) -; X86-NEXT: movl %edx, 16(%ecx) -; X86-NEXT: movl %edi, 20(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) ; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: jne .LBB23_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: .LBB23_2: -; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; 
SSE-LABEL: chain_reset_i256: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: shrb $3, %al -; SSE-NEXT: andb $24, %al -; SSE-NEXT: negb %al -; SSE-NEXT: movsbq %al, %r10 -; SSE-NEXT: movq -24(%rsp,%r10), %r8 -; SSE-NEXT: movq -16(%rsp,%r10), %rax -; SSE-NEXT: shldq %cl, %r8, %rax -; SSE-NEXT: movq -32(%rsp,%r10), %r9 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -40(%rsp,%r10), %r10 -; SSE-NEXT: shldq %cl, %r10, %r9 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rax -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: andq 16(%rdi), %r8 -; SSE-NEXT: andq 8(%rdi), %r9 -; SSE-NEXT: andq (%rdi), %r10 -; SSE-NEXT: movq %r8, 16(%rdi) -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %r10, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: orq %rax, %r9 -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movl (%rsi), %eax -; SSE-NEXT: movl %r10d, (%rsi) -; SSE-NEXT: movl (%rdx), %ecx -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: retq -; -; AVX2-LABEL: chain_reset_i256: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrb $3, %al -; AVX2-NEXT: andb $24, %al -; AVX2-NEXT: negb %al -; AVX2-NEXT: movsbq %al, %rax -; AVX2-NEXT: movq -32(%rsp,%rax), %r8 -; AVX2-NEXT: movq -24(%rsp,%rax), %r9 -; AVX2-NEXT: movq %r9, %r10 -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: movq -40(%rsp,%rax), %r11 -; AVX2-NEXT: movq -16(%rsp,%rax), %rax -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: shldq %cl, %r11, %r8 -; AVX2-NEXT: andnq 24(%rdi), %rax, %rax -; AVX2-NEXT: andnq 16(%rdi), %r10, %r9 -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: shlxq %rcx, %r11, %rcx -; AVX2-NEXT: andnq (%rdi), %rcx, %rcx -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %rax, 24(%rdi) -; AVX2-NEXT: movq %rcx, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: orq %rcx, %r9 -; AVX2-NEXT: movl (%rsi), %eax -; AVX2-NEXT: movl %ecx, (%rsi) -; AVX2-NEXT: movl (%rdx), %ecx -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: cmovnel %ecx, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: chain_reset_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: shrb $3, %al -; AVX512-NEXT: andb $24, %al -; AVX512-NEXT: negb %al -; AVX512-NEXT: movsbq %al, %rax -; AVX512-NEXT: movq -40(%rsp,%rax), %r8 -; AVX512-NEXT: movq -32(%rsp,%rax), %r9 -; AVX512-NEXT: movq -24(%rsp,%rax), %r10 -; AVX512-NEXT: movq %r10, %r11 -; AVX512-NEXT: shldq %cl, %r9, %r11 -; AVX512-NEXT: movq -16(%rsp,%rax), %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: shlxq %rcx, %r8, %r10 -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r8, %r9 -; 
AVX512-NEXT: andnq 24(%rdi), %rax, %rax -; AVX512-NEXT: andnq 16(%rdi), %r11, %rcx -; AVX512-NEXT: andnq 8(%rdi), %r9, %r8 -; AVX512-NEXT: andnq (%rdi), %r10, %r9 -; AVX512-NEXT: movq %rcx, 16(%rdi) -; AVX512-NEXT: movq %rax, 24(%rdi) -; AVX512-NEXT: movq %r9, (%rdi) -; AVX512-NEXT: movq %r8, 8(%rdi) -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: movl (%rsi), %eax -; AVX512-NEXT: movl %r9d, (%rsi) -; AVX512-NEXT: movl (%rdx), %edx -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: chain_reset_i256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: movl $-2, %eax +; X64-NEXT: roll %cl, %eax +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $28, %ecx +; X64-NEXT: andl %eax, (%rdi,%rcx) +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: orq 24(%rdi), %r8 +; X64-NEXT: movq 16(%rdi), %rdi +; X64-NEXT: orq %rcx, %rdi +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq %rem = and i32 %position, 255 %ofs = zext nneg i32 %rem to i256 %bit = shl nuw i256 1, %ofs From 9e7207eccc43fba6806b1a2b42cf0f60e7337ce8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 5 Nov 2025 08:51:13 +0000 Subject: [PATCH 2/2] Drop extra store-load-store chain splicing --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d2eeb76057418..4d44227b3ecd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53446,13 +53446,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, Align(), St->getMemOperand()->getFlags()); // If there are other uses of StoredVal, replace with a new load of the - // whole (updated) value and ensure that any chained dependencies on the - // original store are updated to come AFTER the new load. + // whole (updated) value. if (!StoredVal.hasOneUse()) { SDValue NewLoad = DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); DAG.ReplaceAllUsesWith(StoredVal, NewLoad); - DAG.ReplaceAllUsesWith(SDValue(St, 0), NewLoad.getValue(1)); } return NewStore; }
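
For illustration, here is a minimal IR sketch of the case this series
unlocks, modelled on the existing complement_cmpz_i128 test above (the
body is paraphrased for exposition, not copied verbatim from the test
file). The xor result has two uses - the RMW store and the compare - so
the old StoredVal.hasOneUse() bailout previously blocked the narrowing:

  define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
    %rem = and i32 %position, 127
    %ofs = zext nneg i32 %rem to i128
    %bit = shl nuw i128 1, %ofs
    %ld = load i128, ptr %word
    %res = xor i128 %ld, %bit     ; BTC-style complement of a single bit
    store i128 %res, ptr %word    ; RMW store, now narrowed to a 32-bit xor
    %cmp = icmp ne i128 %res, 0   ; additional use of the bit-twiddled value
    ret i1 %cmp
  }

With the patches applied, the store becomes a single 32-bit xor-to-memory
at the selected dword offset (see the new X86/SSE/AVX check lines above),
and the compare is rewritten to use a full-width load chained after the
new store.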