@@ -876,13 +876,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
876876 return DataIdx >= 0 &&
877877 TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
878878 };
879+
879880 int WaitStatesNeededForDef =
880881 VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
881882 WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
882883
883884 return WaitStatesNeeded;
884885}
885886
887+ // / Dest sel forwarding issue occurs if additional logic is needed to swizzle /
888+ // / pack the computed value into correct bit position of the dest register. This
889+ // / occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
890+ // / dst_sel that is not aligned to the register. This function analayzes the \p
891+ // / MI and \returns an operand with dst forwarding issue, or nullptr if
892+ // / none exists.
893+ static const MachineOperand *
894+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
895+ if (!SIInstrInfo::isVALU (MI))
896+ return nullptr ;
897+
898+ const SIInstrInfo *TII = ST.getInstrInfo ();
899+
900+ unsigned Opcode = MI.getOpcode ();
901+
902+ // There are three different types of instructions
903+ // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
904+ // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
905+ // CVT_SR_BF8_F32 with op_sel[3:2]
906+ // != 0
907+ if (SIInstrInfo::isSDWA (MI)) {
908+ // Type 1: SDWA with dst_sel != DWORD
909+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
910+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
911+ return nullptr ;
912+ } else {
913+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
914+ // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
915+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
916+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
917+ SISrcMods::DST_OP_SEL ||
918+ (AMDGPU::isFP8DstSelInst (Opcode) &&
919+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
920+ SISrcMods::OP_SEL_0))))
921+ return nullptr ;
922+ }
923+
924+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
925+ }
926+
927+ // / Checks whether the provided \p MI "consumes" the operand with a Dest sel
928+ // / fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
929+ // / RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
930+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
931+ const MachineOperand *Dst,
932+ const SIRegisterInfo *TRI) {
933+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
934+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
935+ // and we must account for that hazard.
936+ // We also must account for WAW hazards. In particular, WAW with dest
937+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
938+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
939+ // check for ECC. Without accounting for this hazard, the ECC will be
940+ // wrong.
941+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
942+ // complete zeroesHigh16BitsOfDest)
943+ for (auto &Operand : VALU->operands ()) {
944+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
945+ return true ;
946+ }
947+ }
948+ return false ;
949+ }
950+
886951int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
887952 int WaitStatesNeeded = 0 ;
888953
@@ -913,27 +978,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
913978 if (ST.hasDstSelForwardingHazard ()) {
914979 const int Shift16DefWaitstates = 1 ;
915980
916- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
917- if (!SIInstrInfo::isVALU (MI))
918- return false ;
919- const SIInstrInfo *TII = ST.getInstrInfo ();
920- if (SIInstrInfo::isSDWA (MI)) {
921- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
922- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
923- return false ;
924- } else {
925- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
926- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
927- ->getImm () &
928- SISrcMods::DST_OP_SEL))
929- return false ;
930- }
981+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
931982 const SIRegisterInfo *TRI = ST.getRegisterInfo ();
932- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
933- Register Def = Dst->getReg ();
983+ const MachineOperand *ForwardedDst =
984+ getDstSelForwardingOperand (ProducerMI, ST);
985+ if (ForwardedDst) {
986+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
987+ }
934988
935- for (const MachineOperand &Use : VALU->explicit_uses ()) {
936- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
989+ if (ProducerMI.isInlineAsm ()) {
990+ // Assume inline asm has dst forwarding hazard
991+ for (auto &Def : ProducerMI.all_defs ()) {
992+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
937993 return true ;
938994 }
939995 }
@@ -1030,7 +1086,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10301086 // problematic thus far.
10311087
10321088 // see checkVALUHazards()
1033- if (!ST.has12DWordStoreHazard ())
1089+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
10341090 return 0 ;
10351091
10361092 const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1039,11 +1095,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10391095 for (const MachineOperand &Op :
10401096 llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
10411097 if (Op.isReg () && Op.isDef ()) {
1042- WaitStatesNeeded =
1043- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1098+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1099+ continue ;
1100+
1101+ if (ST.has12DWordStoreHazard ()) {
1102+ WaitStatesNeeded =
1103+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1104+ }
10441105 }
10451106 }
10461107
1108+ if (ST.hasDstSelForwardingHazard ()) {
1109+ const int Shift16DefWaitstates = 1 ;
1110+
1111+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1112+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1113+ // Assume inline asm reads the dst
1114+ if (Dst)
1115+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1116+ IA->readsRegister (Dst->getReg (), &TRI);
1117+
1118+ if (ProducerMI.isInlineAsm ()) {
1119+ // If MI is inline asm, assume it has dst forwarding hazard
1120+ for (auto &Def : ProducerMI.all_defs ()) {
1121+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1122+ IA->readsRegister (Def.getReg (), &TRI)) {
1123+ return true ;
1124+ }
1125+ }
1126+ }
1127+
1128+ return false ;
1129+ };
1130+
1131+ int WaitStatesNeededForDef =
1132+ Shift16DefWaitstates -
1133+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1134+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1135+ }
1136+
10471137 return WaitStatesNeeded;
10481138}
10491139
0 commit comments