@@ -88,6 +88,9 @@ class SIInsertWaits : public MachineFunctionPass {
8888 // / \brief Whether the machine function returns void
8989 bool ReturnsVoid;
9090
91+ // / Whether the VCCZ bit is possibly corrupt
92+ bool VCCZCorrupt;
93+
9194 // / \brief Get increment/decrement amount for this instruction.
9295 Counters getHwCounts (MachineInstr &MI);
9396
@@ -116,14 +119,19 @@ class SIInsertWaits : public MachineFunctionPass {
116119 // / \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
117120 void handleSendMsg (MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
118121
122+ // / Return true if there are LGKM instructions that haven't been waited on
123+ // / yet.
124+ bool hasOutstandingLGKM () const ;
125+
119126public:
120127 static char ID;
121128
// / Construct the pass: TII and TRI start out null, ExpInstrTypesSeen is
// / zeroed, and VCCZCorrupt is cleared (the vccz bit is initially assumed
// / to be valid).
122129 SIInsertWaits () :
123130 MachineFunctionPass (ID),
124131 TII (nullptr ),
125132 TRI (nullptr ),
126- ExpInstrTypesSeen (0 ) { }
133+ ExpInstrTypesSeen (0 ),
134+ VCCZCorrupt (false ) { }
127135
128136 bool runOnMachineFunction (MachineFunction &MF) override ;
129137
@@ -155,6 +163,13 @@ FunctionPass *llvm::createSIInsertWaitsPass() {
// / Constant counter triple (15, 7, 15). NOTE(review): presumably the
// / maximum value of each hardware wait counter -- confirm against the
// / Counters definition, which is not visible in this hunk.
155163const Counters SIInsertWaits::WaitCounts = { { 15 , 7 , 15 } };
// / Counter triple with every counter set to zero.
156164const Counters SIInsertWaits::ZeroCounts = { { 0 , 0 , 0 } };
157165
166+ static bool readsVCCZ (unsigned Opcode) {
167+ return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCNZ;
168+ }
169+
170+ bool SIInsertWaits::hasOutstandingLGKM () const {
171+ return WaitedOn.Named .LGKM != LastIssued.Named .LGKM ;
172+ }
158173
159174Counters SIInsertWaits::getHwCounts (MachineInstr &MI) {
160175 uint64_t TSFlags = MI.getDesc ().TSFlags ;
@@ -475,6 +490,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
475490 TRI =
476491 static_cast <const SIRegisterInfo *>(MF.getSubtarget ().getRegisterInfo ());
477492
493+ const AMDGPUSubtarget &ST = MF.getSubtarget <AMDGPUSubtarget>();
478494 MRI = &MF.getRegInfo ();
479495
480496 WaitedOn = ZeroCounts;
@@ -493,6 +509,44 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
493509 for (MachineBasicBlock::iterator I = MBB.begin (), E = MBB.end ();
494510 I != E; ++I) {
495511
512+ if (ST.getGeneration () <= AMDGPUSubtarget::SEA_ISLANDS) {
513+ // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
514+ // the vccz bit, so when we detect that an instruction may read from a
515+ // corrupt vccz bit, we need to:
516+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
517+ // complete.
518+ // 2. Restore the correct value of vccz by writing the current value
519+ // of vcc back to vcc.
520+
521+ if (TII->isSMRD (I->getOpcode ())) {
522+ VCCZCorrupt = true ;
523+ } else if (!hasOutstandingLGKM () && I->modifiesRegister (AMDGPU::VCC, TRI)) {
524+ // FIXME: We only care about SMRD instructions here, not LDS or GDS.
525+ // Whenever we store a value in vcc, the correct value of vccz is
526+ // restored.
527+ VCCZCorrupt = false ;
528+ }
529+
530+ // Check if we need to apply the bug work-around
531+ if (readsVCCZ (I->getOpcode ()) && VCCZCorrupt) {
532+ DEBUG (dbgs () << " Inserting vccz bug work-around before: " << *I << ' \n ' );
533+
534+ // Wait on everything, not just LGKM. vccz reads usually come from
535+ // terminators, and we always wait on everything at the end of the
536+ // block, so if we only wait on LGKM here, we might end up with
537+ // another s_waitcnt inserted right after this if there are non-LGKM
538+ // instructions still outstanding.
539+ insertWait (MBB, I, LastIssued);
540+
541+ // Restore the vccz bit. Any time a value is written to vcc, the vccz
542+ // bit is updated, so we can restore the bit by reading the value of
543+ // vcc and then writing it back to the register.
544+ BuildMI (MBB, I, I->getDebugLoc (), TII->get (AMDGPU::S_MOV_B64),
545+ AMDGPU::VCC)
546+ .addReg (AMDGPU::VCC);
547+ }
548+ }
549+
496550 // Wait for everything before a barrier.
497551 if (I->getOpcode () == AMDGPU::S_BARRIER)
498552 Changes |= insertWait (MBB, I, LastIssued);
0 commit comments