Skip to content

Commit 9ee9fb0

Browse files
authored
[X86] narrowBitOpRMW - add handling for single bit insertion patterns (REAPPLIED) (#166337)
Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32)). We must ensure that the InsertBit is zero apart from the LSB so we can cheaply truncate it to work with the i32 block like the simpler BT patterns. REAPPLIED from #165742, which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by #166160.
1 parent bb4ed55 commit 9ee9fb0

File tree

2 files changed

+114
-881
lines changed

2 files changed

+114
-881
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53349,7 +53349,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
5334953349
}
5335053350

5335153351
// Look for a RMW operation that only touches one bit of a larger than legal
53352-
// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
53352+
// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
53353+
// i32 sub value.
5335353354
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5335453355
SelectionDAG &DAG,
5335553356
const X86Subtarget &Subtarget) {
@@ -53375,28 +53376,42 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5337553376
// BTR: X & ~(1 << ShAmt)
5337653377
// BTS: X | (1 << ShAmt)
5337753378
// BTC: X ^ (1 << ShAmt)
53378-
SDValue ShAmt;
53379+
//
53380+
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
53381+
SDValue InsertBit, ShAmt;
5337953382
if (!StoredVal.hasOneUse() ||
5338053383
!(sd_match(StoredVal, m_And(m_Specific(LoadVal),
5338153384
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
5338253385
sd_match(StoredVal,
5338353386
m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
5338453387
sd_match(StoredVal,
53385-
m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
53388+
m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53389+
sd_match(StoredVal,
53390+
m_Or(m_And(m_Specific(LoadVal),
53391+
m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
53392+
m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
5338653393
return SDValue();
5338753394

5338853395
// Ensure the shift amount is in bounds.
5338953396
KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
5339053397
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
5339153398
return SDValue();
5339253399

53400+
// If we're inserting a bit then it must be the LSB.
53401+
if (InsertBit) {
53402+
KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
53403+
if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
53404+
return SDValue();
53405+
}
53406+
5339353407
// Split the shift into an alignment shift that moves the active i32 block to
5339453408
// the bottom bits for truncation and a modulo shift that can act on the i32.
5339553409
EVT AmtVT = ShAmt.getValueType();
5339653410
SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
5339753411
DAG.getSignedConstant(-32LL, DL, AmtVT));
5339853412
SDValue ModuloAmt =
5339953413
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
53414+
ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
5340053415

5340153416
// Compute the byte offset for the i32 block that is changed by the RMW.
5340253417
// combineTruncate will adjust the load for us in a similar way.
@@ -53411,13 +53426,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5341153426
SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
5341253427
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
5341353428

53414-
SDValue Mask =
53415-
DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
53416-
DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
53417-
if (StoredVal.getOpcode() == ISD::AND)
53418-
Mask = DAG.getNOT(DL, Mask, MVT::i32);
53429+
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
53430+
DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
53431+
53432+
SDValue Res;
53433+
if (InsertBit) {
53434+
SDValue BitMask =
53435+
DAG.getNode(ISD::SHL, DL, MVT::i32,
53436+
DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
53437+
Res =
53438+
DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
53439+
Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
53440+
} else {
53441+
if (StoredVal.getOpcode() == ISD::AND)
53442+
Mask = DAG.getNOT(DL, Mask, MVT::i32);
53443+
Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
53444+
}
5341953445

53420-
SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
5342153446
return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
5342253447
Align(), St->getMemOperand()->getFlags());
5342353448
}

0 commit comments

Comments
 (0)