@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565 bool isVmemAccess (const MachineInstr &MI) const ;
566566 bool generateWaitcntInstBefore (MachineInstr &MI,
567567 WaitcntBrackets &ScoreBrackets,
568- MachineInstr *OldWaitcntInstr,
569- bool FlushVmCnt );
568+ MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569+ bool FlushXCnt );
570570 bool generateWaitcnt (AMDGPU::Waitcnt Wait,
571571 MachineBasicBlock::instr_iterator It,
572572 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573- MachineInstr *OldWaitcntInstr);
573+ MachineInstr *OldWaitcntInstr, bool FlushXCnt );
574574 void updateEventWaitcntAfter (MachineInstr &Inst,
575575 WaitcntBrackets *ScoreBrackets);
576576 bool isNextENDPGM (MachineBasicBlock::instr_iterator It,
@@ -1846,7 +1846,8 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18461846bool SIInsertWaitcnts::generateWaitcntInstBefore (MachineInstr &MI,
18471847 WaitcntBrackets &ScoreBrackets,
18481848 MachineInstr *OldWaitcntInstr,
1849- bool FlushVmCnt) {
1849+ bool FlushVmCnt,
1850+ bool FlushXCnt) {
18501851 setForceEmitWaitcnt ();
18511852
18521853 assert (!MI.isMetaInstruction ());
@@ -2101,18 +2102,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21012102 Wait.BvhCnt = 0 ;
21022103 }
21032104
2105+ // Conservatively flush the XCnt counter at the start of the block.
2106+ if (FlushXCnt) {
2107+ if (ScoreBrackets.hasPendingEvent (SMEM_GROUP) &&
2108+ ScoreBrackets.hasPendingEvent (VMEM_GROUP))
2109+ Wait.XCnt = 0 ;
2110+ }
2111+
21042112 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u )
21052113 Wait.LoadCnt = 0 ;
21062114
21072115 return generateWaitcnt (Wait, MI.getIterator (), *MI.getParent (), ScoreBrackets,
2108- OldWaitcntInstr);
2116+ OldWaitcntInstr, FlushXCnt );
21092117}
21102118
21112119bool SIInsertWaitcnts::generateWaitcnt (AMDGPU::Waitcnt Wait,
21122120 MachineBasicBlock::instr_iterator It,
21132121 MachineBasicBlock &Block,
21142122 WaitcntBrackets &ScoreBrackets,
2115- MachineInstr *OldWaitcntInstr) {
2123+ MachineInstr *OldWaitcntInstr,
2124+ bool FlushXCnt) {
21162125 bool Modified = false ;
21172126
21182127 if (OldWaitcntInstr)
@@ -2141,7 +2150,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21412150 }
21422151
21432152 // XCnt may be already consumed by a load wait.
2144- if (Wait.XCnt != ~0u ) {
2153+ if (Wait.XCnt != ~0u && !FlushXCnt ) {
21452154 if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent (SMEM_GROUP))
21462155 Wait.XCnt = ~0u ;
21472156
@@ -2214,7 +2223,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22142223
22152224 auto SuccessorIt = std::next (Inst.getIterator ());
22162225 bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2217- /* OldWaitcntInstr=*/ nullptr );
2226+ /* OldWaitcntInstr=*/ nullptr , /* FlushXCnt= */ false );
22182227
22192228 if (Result && NeedsEndPGMCheck && isNextENDPGM (SuccessorIt, &Block)) {
22202229 BuildMI (Block, SuccessorIt, Inst.getDebugLoc (), TII->get (AMDGPU::S_NOP))
@@ -2454,6 +2463,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24542463
24552464 // Walk over the instructions.
24562465 MachineInstr *OldWaitcntInstr = nullptr ;
2466+ bool FirstInstInBlock = true ;
24572467
24582468 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin (),
24592469 E = Block.instr_end ();
@@ -2475,10 +2485,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24752485
24762486 bool FlushVmCnt = Block.getFirstTerminator () == Inst &&
24772487 isPreheaderToFlush (Block, ScoreBrackets);
2488+ bool FlushXCnt = FirstInstInBlock;
2489+ if (FirstInstInBlock)
2490+ FirstInstInBlock = false ;
24782491
24792492 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
24802493 Modified |= generateWaitcntInstBefore (Inst, ScoreBrackets, OldWaitcntInstr,
2481- FlushVmCnt);
2494+ FlushVmCnt, FlushXCnt );
24822495 OldWaitcntInstr = nullptr ;
24832496
24842497 // Restore vccz if it's not known to be correct already.
@@ -2567,7 +2580,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25672580
25682581 // Combine or remove any redundant waitcnts at the end of the block.
25692582 Modified |= generateWaitcnt (Wait, Block.instr_end (), Block, ScoreBrackets,
2570- OldWaitcntInstr);
2583+ OldWaitcntInstr, /* FlushXCnt= */ false );
25712584
25722585 LLVM_DEBUG ({
25732586 dbgs () << " *** End Block: " ;
0 commit comments