@@ -873,13 +873,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
873873 return DataIdx >= 0 &&
874874 TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
875875 };
876+
876877 int WaitStatesNeededForDef =
877878 VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
878879 WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
879880
880881 return WaitStatesNeeded;
881882}
882883
884+ /// A dest sel forwarding issue occurs if additional logic is needed to swizzle /
885+ /// pack the computed value into the correct bit position of the dest register.
886+ /// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
887+ /// dst_sel that is not aligned to the register. This function analyzes \p MI
888+ /// and \returns the operand with the dst forwarding issue (its vdst), or
889+ /// nullptr if none exists.
890+ static const MachineOperand *
891+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
892+ if (!SIInstrInfo::isVALU (MI))
893+ return nullptr ;
894+
895+ const SIInstrInfo *TII = ST.getInstrInfo ();
896+
897+ unsigned Opcode = MI.getOpcode ();
898+
899+ // There are three different types of instructions which produce a forwarded
900+ // dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which write the hi bits
901+ // (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with
902+ // op_sel[3:2] != 0. Instructions matching none of these return nullptr early;
903+ // everything else (including SDWA with no dst_sel operand) falls through to the vdst lookup at the end.
904+ if (SIInstrInfo::isSDWA (MI)) {
905+ // Type 1: SDWA with dst_sel != DWORD (dst_sel == DWORD has no forwarding issue)
906+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
907+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
908+ return nullptr ;
909+ } else {
910+ // Type 2 || Type 3: (VOP3 which write the hi bits, i.e. DST_OP_SEL set in src0_modifiers) ||
911+ // (CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2] != 0, checked here via OP_SEL_0 in src2_modifiers)
912+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
913+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
914+ SISrcMods::DST_OP_SEL ||
915+ (AMDGPU::isFP8DstSelInst (Opcode) &&
916+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
917+ SISrcMods::OP_SEL_0))))
918+ return nullptr ;
919+ }
920+
921+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
922+ }
923+
924+ /// Checks whether the provided \p VALU "consumes" the operand with a dest sel
925+ /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
926+ /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW). \p Dst is dereferenced unconditionally, so it must be non-null.
927+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
928+ const MachineOperand *Dst,
929+ const SIRegisterInfo *TRI) {
930+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
931+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
932+ // and we must account for that hazard.
933+ // We also must account for WAW hazards. In particular, WAW with dest
934+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
935+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
936+ // check for ECC. Without accounting for this hazard, the ECC will be
937+ // wrong.
938+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
939+ // complete zeroesHigh16BitsOfDest). Until then, any register operand of VALU that overlaps Dst counts.
940+ for (auto &Operand : VALU->operands ()) {
941+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
942+ return true ;
943+ }
944+ }
945+ return false ;
946+ }
947+
883948int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
884949 int WaitStatesNeeded = 0 ;
885950
@@ -910,27 +975,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
910975 if (ST.hasDstSelForwardingHazard ()) {
911976 const int Shift16DefWaitstates = 1 ;
912977
913- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
914- if (!SIInstrInfo::isVALU (MI))
915- return false ;
916- const SIInstrInfo *TII = ST.getInstrInfo ();
917- if (SIInstrInfo::isSDWA (MI)) {
918- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
919- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
920- return false ;
921- } else {
922- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
923- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
924- ->getImm () &
925- SISrcMods::DST_OP_SEL))
926- return false ;
927- }
978+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
928979 const SIRegisterInfo *TRI = ST.getRegisterInfo ();
929- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
930- Register Def = Dst->getReg ();
980+ const MachineOperand *ForwardedDst =
981+ getDstSelForwardingOperand (ProducerMI, ST);
982+ if (ForwardedDst) {
983+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
984+ }
931985
932- for (const MachineOperand &Use : VALU->explicit_uses ()) {
933- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
986+ if (ProducerMI.isInlineAsm ()) {
987+ // Assume inline asm has dst forwarding hazard
988+ for (auto &Def : ProducerMI.all_defs ()) {
989+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
934990 return true ;
935991 }
936992 }
@@ -1027,7 +1083,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10271083 // problematic thus far.
10281084
10291085 // see checkVALUHazards()
1030- if (!ST.has12DWordStoreHazard ())
1086+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
10311087 return 0 ;
10321088
10331089 const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1036,11 +1092,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10361092 for (const MachineOperand &Op :
10371093 llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
10381094 if (Op.isReg () && Op.isDef ()) {
1039- WaitStatesNeeded =
1040- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1095+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1096+ continue ;
1097+
1098+ if (ST.has12DWordStoreHazard ()) {
1099+ WaitStatesNeeded =
1100+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1101+ }
10411102 }
10421103 }
10431104
1105+ if (ST.hasDstSelForwardingHazard ()) {
1106+ const int Shift16DefWaitstates = 1 ;
1107+
1108+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1109+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1110+ // Assume inline asm reads the dst
1111+ if (Dst)
1112+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1113+ IA->readsRegister (Dst->getReg (), &TRI);
1114+
1115+ if (ProducerMI.isInlineAsm ()) {
1116+ // If MI is inline asm, assume it has dst forwarding hazard
1117+ for (auto &Def : ProducerMI.all_defs ()) {
1118+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1119+ IA->readsRegister (Def.getReg (), &TRI)) {
1120+ return true ;
1121+ }
1122+ }
1123+ }
1124+
1125+ return false ;
1126+ };
1127+
1128+ int WaitStatesNeededForDef =
1129+ Shift16DefWaitstates -
1130+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1131+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1132+ }
1133+
10441134 return WaitStatesNeeded;
10451135}
10461136
0 commit comments