@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+                               const GCNSubtarget *STI = nullptr);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  // The constrained sload instructions in the S_LOAD_IMM class have the
+  // early-clobber flag set on the dst operand. Remove the flag before using
+  // the MOs in the copies.
+  if (Dest0->isEarlyClobber())
+    Dest0->setIsEarlyClobber(false);
+
+  if (Dest1->isEarlyClobber())
+    Dest1->setIsEarlyClobber(false);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
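A note on the hunk above: the early-clobber bit lives on the MachineOperand itself, so without the new code it would be carried verbatim into the COPYs built from Dest0/Dest1. A minimal sketch of the idea, with the operand name (AMDGPU::OpName::sdst) and the surrounding variables assumed for illustration rather than taken from the patch:

    // Sketch: strip the early-clobber bit inherited from an _ec scalar load
    // before the operand is reused in a plain COPY.
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
    if (Dst && Dst->isEarlyClobber())
      Dst->setIsEarlyClobber(false);
    BuildMI(*MBB, InsertBefore, DL, TII->get(TargetOpcode::COPY))
        .add(*Dst)                      // dst of the COPY: the original load's dst
        .addReg(DestReg, 0, SubRegIdx); // src: sub-register of the merged load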
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  const unsigned Opcode = getNewOpcode(CI, Paired);
+  const unsigned Opcode = getNewOpcode(CI, Paired, STM);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
 
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
-                                            const CombineInfo &Paired) {
+                                            const CombineInfo &Paired,
+                                            const GCNSubtarget *STI) {
   const unsigned Width = CI.Width + Paired.Width;
 
   switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
   case S_LOAD_IMM:
-    switch (Width) {
-    default:
-      return 0;
-    case 2:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
-    case 3:
-      return AMDGPU::S_LOAD_DWORDX3_IMM;
-    case 4:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
-    case 8:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
+    // For targets that support XNACK replay, use the constrained load opcode.
+    if (STI && STI->hasXnackReplay()) {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+      }
+    } else {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM;
+      }
     }
   case GLOBAL_LOAD:
     switch (Width) {
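The rewritten S_LOAD_IMM case above can be read as a small standalone helper, sketched here under the assumption (consistent with the rest of the pass) that Width is the combined result width in dwords; the patch keeps this logic inline in getNewOpcode rather than introducing such a helper:

    // Sketch of the merged-opcode selection for S_LOAD_*_IMM pairs.
    static unsigned selectMergedSMEMLoadOpcode(const GCNSubtarget &STI,
                                               unsigned Width) {
      // On subtargets that support XNACK replay, pick the early-clobber (_ec)
      // forms so the destination cannot be allocated over the base address.
      const bool EC = STI.hasXnackReplay();
      switch (Width) {
      case 2:
        return EC ? AMDGPU::S_LOAD_DWORDX2_IMM_ec : AMDGPU::S_LOAD_DWORDX2_IMM;
      case 3:
        return EC ? AMDGPU::S_LOAD_DWORDX3_IMM_ec : AMDGPU::S_LOAD_DWORDX3_IMM;
      case 4:
        return EC ? AMDGPU::S_LOAD_DWORDX4_IMM_ec : AMDGPU::S_LOAD_DWORDX4_IMM;
      case 8:
        return EC ? AMDGPU::S_LOAD_DWORDX8_IMM_ec : AMDGPU::S_LOAD_DWORDX8_IMM;
      default:
        return 0; // no merged opcode for this width
      }
    }

The intent of the early-clobber constraint is that a replayed load still sees an intact base address: if the wider merged destination were allowed to overlap SBase, an access that faults and is replayed after an XNACK could read through a partially overwritten pointer.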