@@ -1204,6 +1204,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
     fixGetRegWaitIdle(MI);
   if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
     fixDsAtomicAsyncBarrierArriveB64(MI);
+  if (ST.hasScratchBaseForwardingHazard())
+    fixScratchBaseForwardingHazard(MI);
 }
 
 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -3468,3 +3470,79 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
 
   return true;
 }
+
+bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
+  // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
+  // for the hazard to trigger.
+  if (!IsHazardRecognizerMode)
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU;
+  // conservatively use the larger count for writes from either unit.
+  const int FlatScrBaseWaitStates = 10;
+
+  bool ReadsFlatScrLo =
+      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
+  bool ReadsFlatScrHi =
+      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
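+  // An s_getreg_b32 of the FLAT_SCR_LO/HI hardware registers also reads the
+  // flat scratch base.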
+  if (isSGetReg(MI->getOpcode())) {
+    switch (getHWReg(TII, *MI)) {
+    default:
+      break;
+    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
+      ReadsFlatScrLo = true;
+      break;
+    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
+      ReadsFlatScrHi = true;
+      break;
+    }
+  }
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
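+  // Returns true if Reg was written within the hazard window: fewer than
+  // FlatScrBaseWaitStates intervening SGPR writes and no s_waitcnt_depctr
+  // that resolves the dependency.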
+  auto IsRegDefHazard = [&](Register Reg) -> bool {
+    DenseSet<const MachineBasicBlock *> Visited;
+    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
+      return MI.modifiesRegister(Reg, TRI);
+    };
+
+    // This abuses the notion of waitstates: instead of wait states it
+    // returns 1 for an instruction that writes an SGPR and 0 otherwise.
+    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
+      if (!TII->isSALU(MI) && !TII->isVALU(MI))
+        return 0;
+      for (const MachineOperand &MO : MI.all_defs()) {
+        if (TRI->isSGPRReg(MRI, MO.getReg()))
+          return 1;
+      }
+      return 0;
+    };
+
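+    // The backwards search expires early at an s_waitcnt_depctr that clears
+    // both sa_sdst and va_sdst, or once enough SGPR writes have been seen.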
+    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
+      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+        unsigned Wait = MI.getOperand(0).getImm();
+        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
+            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
+          return true;
+      }
+      return SgprWrites >= FlatScrBaseWaitStates;
+    };
+
+    return ::getWaitStatesSince(
+               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
+               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
+  };
+
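+  // Only a hazard if the half of the base being read was recently written
+  // through SGPR102/SGPR103 and that register is not constant.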
+  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
+       !IsRegDefHazard(AMDGPU::SGPR102)) &&
+      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
+       !IsRegDefHazard(AMDGPU::SGPR103)))
+    return false;
+
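+  // Mitigate by waiting for both SALU and VALU SGPR writes
+  // (sa_sdst = 0, va_sdst = 0).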
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
+          AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+  return true;
+}