-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[X86] narrowBitOpRMW - add handling for single bit insertion patterns (REAPPLIED) #166337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
… (REAPPLIED) Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32)). We must ensure that the InsertBit is zero apart from the LSB so that we can cheaply truncate it to work with the i32 block, like the simpler BT patterns. REAPPLIED from llvm#165742, which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by llvm#166160.
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32)). We must ensure that the InsertBit is zero apart from the LSB so that we can cheaply truncate it to work with the i32 block, like the simpler BT patterns. REAPPLIED from #165742, which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by #166160. Patch is 43.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166337.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b97b5089cb0a3..d4a4d4339f7e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53349,7 +53349,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -53375,14 +53376,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTR: X & ~(1 << ShAmt)
// BTS: X | (1 << ShAmt)
// BTC: X ^ (1 << ShAmt)
- SDValue ShAmt;
+ //
+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+ SDValue InsertBit, ShAmt;
if (!StoredVal.hasOneUse() ||
!(sd_match(StoredVal, m_And(m_Specific(LoadVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+ m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(StoredVal,
+ m_Or(m_And(m_Specific(LoadVal),
+ m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53390,6 +53397,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
return SDValue();
+ // If we're inserting a bit then it must be the LSB.
+ if (InsertBit) {
+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+ return SDValue();
+ }
+
// Split the shift into an alignment shift that moves the active i32 block to
// the bottom bits for truncation and a modulo shift that can act on the i32.
EVT AmtVT = ShAmt.getValueType();
@@ -53397,6 +53411,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
DAG.getSignedConstant(-32LL, DL, AmtVT));
SDValue ModuloAmt =
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
// Compute the byte offset for the i32 block that is changed by the RMW.
// combineTruncate will adjust the load for us in a similar way.
@@ -53411,13 +53426,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- SDValue Mask =
- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
+
+ SDValue Res;
+ if (InsertBit) {
+ SDValue BitMask =
+ DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+ Res =
+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+ } else {
+ if (StoredVal.getOpcode() == ISD::AND)
+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+ }
- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
Align(), St->getMemOperand()->getFlags());
}
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index c311ab869c311..87a54a0b9148d 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: notl %esi
-; X86-NEXT: notl %edx
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB9_4:
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $32, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ebx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: movl %edx, (%ebx)
-; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $96, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edx
-; X86-NEXT: movl 68(%esp,%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: movl 72(%esp,%esi), %ebx
-; X86-NEXT: movl 76(%esp,%esi), %esi
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl 36(%esp,%ecx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esp,%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl 8(%eax), %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 44(%esp,%eax), %eax
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 12(%ecx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl (%eax), %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 4(%ecx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: btl %esi, %eax
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %r9d, %r9d
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq 8(%rdi), %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq (%rdi), %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: andl $96, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: andl $96, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rsi
-; AVX2-NEXT: cmovneq %r9, %rax
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: cmovneq %rdx, %r8
-; AVX2-NEXT: cmovneq %r9, %rdx
-; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %r8
-; AVX2-NEXT: orq %rdx, %r8
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $96, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: movl (%rdi,%rax), %eax
-; AVX2-NEXT: btl %ecx, %eax
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: movq %r8, (%rdi)
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rax, %rsi
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: xorl %r9d, %r9d
-; AVX512-NEXT: shldq %cl, %rdx, %r9
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: cmovneq %r8, %rax
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: cmovneq %rdx, %r9
-; AVX512-NEXT: cmovneq %r8, %rdx
-; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: andnq (%rdi), %rax, %r8
-; AVX512-NEXT: orq %rdx, %r8
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: andl $96, %eax
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: movl (%rdi,%rax), %eax
-; AVX512-NEXT: btl %ecx, %eax
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andl $96, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -977,673 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $352, %esp # imm = 0x160
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %edi
-; X86-NEXT: movl 44(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9...
[truncated]
|
…unknown offset load/stores llvm#166337 replaces large (illegal type) loads/stores with a smaller i32 load/store based off the demanded shifted bits. As these shifts are non-constant, we can't regenerate the PointerInfo data with a fixed offset, so we need to discard the data entirely. Fixes llvm#166744: the post-RA scheduler has to reconstruct dependencies after the chains have been stripped and uses the pointer info instead, which resulted in some loads being rescheduled earlier than the dependent store because they were thought not to alias.
…unknown offset load/stores (#166752) #166337 replaces large (illegal type) loads/stores with a smaller i32 load/store based off the demanded shifted bits. As these shifts are non-constant, we can't regenerate the PointerInfo data with a fixed offset, so we need to discard the data entirely. Fixes #166744: the post-RA scheduler has to reconstruct dependencies after the chains have been stripped and uses the pointer info instead, which resulted in some loads being rescheduled earlier than the dependent store because they were thought not to alias.
Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32)).
We must ensure that the InsertBit is zero apart from the LSB so we can cheaply truncate it to work with the i32 block like the simpler BT patterns.
REAPPLIED from #165742, which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by #166160.