@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
216216 CombineInfo &Paired, bool Modify = false );
217217 static bool widthsFit (const GCNSubtarget &STI, const CombineInfo &CI,
218218 const CombineInfo &Paired);
219- static unsigned getNewOpcode (const CombineInfo &CI, const CombineInfo &Paired);
219+ unsigned getNewOpcode (const CombineInfo &CI, const CombineInfo &Paired);
220220 static std::pair<unsigned , unsigned > getSubRegIdxs (const CombineInfo &CI,
221221 const CombineInfo &Paired);
222222 const TargetRegisterClass *
@@ -353,6 +353,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
353353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355355 case AMDGPU::S_LOAD_DWORDX2_IMM:
356+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
356357 case AMDGPU::GLOBAL_LOAD_DWORDX2:
357358 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
358359 case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +364,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363364 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
364365 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
365366 case AMDGPU::S_LOAD_DWORDX3_IMM:
367+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
366368 case AMDGPU::GLOBAL_LOAD_DWORDX3:
367369 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
368370 case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +375,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
373375 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
374376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
375377 case AMDGPU::S_LOAD_DWORDX4_IMM:
378+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
376379 case AMDGPU::GLOBAL_LOAD_DWORDX4:
377380 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
378381 case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +386,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
383386 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
384387 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
385388 case AMDGPU::S_LOAD_DWORDX8_IMM:
389+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
386390 return 8 ;
387391 case AMDGPU::DS_READ_B32:
388392 case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +511,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
507511 case AMDGPU::S_LOAD_DWORDX3_IMM:
508512 case AMDGPU::S_LOAD_DWORDX4_IMM:
509513 case AMDGPU::S_LOAD_DWORDX8_IMM:
514+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
510518 return S_LOAD_IMM;
511519 case AMDGPU::DS_READ_B32:
512520 case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +599,10 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
591599 case AMDGPU::S_LOAD_DWORDX3_IMM:
592600 case AMDGPU::S_LOAD_DWORDX4_IMM:
593601 case AMDGPU::S_LOAD_DWORDX8_IMM:
602+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
594606 return AMDGPU::S_LOAD_DWORD_IMM;
595607 case AMDGPU::GLOBAL_LOAD_DWORD:
596608 case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +715,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703715 case AMDGPU::S_LOAD_DWORDX3_IMM:
704716 case AMDGPU::S_LOAD_DWORDX4_IMM:
705717 case AMDGPU::S_LOAD_DWORDX8_IMM:
718+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
706722 Result.SBase = true ;
707723 return Result;
708724 case AMDGPU::DS_READ_B32:
@@ -1212,8 +1228,14 @@ void SILoadStoreOptimizer::copyToDestRegs(
12121228
12131229 // Copy to the old destination registers.
12141230 const MCInstrDesc &CopyDesc = TII->get (TargetOpcode::COPY);
1215- const auto *Dest0 = TII->getNamedOperand (*CI.I , OpName);
1216- const auto *Dest1 = TII->getNamedOperand (*Paired.I , OpName);
1231+ auto *Dest0 = TII->getNamedOperand (*CI.I , OpName);
1232+ auto *Dest1 = TII->getNamedOperand (*Paired.I , OpName);
1233+
1234+ // The constrained sload instructions in S_LOAD_IMM class will have
1235+ // `early-clobber` flag in the dst operand. Remove the flag before using the
1236+ // MOs in copies.
1237+ Dest0->setIsEarlyClobber (false );
1238+ Dest1->setIsEarlyClobber (false );
12171239
12181240 BuildMI (*MBB, InsertBefore, DL, CopyDesc)
12191241 .add (*Dest0) // Copy to same destination including flags and sub reg.
@@ -1700,19 +1722,29 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
17001722 case 8 :
17011723 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17021724 }
1703- case S_LOAD_IMM:
1725+ case S_LOAD_IMM: {
1726+ // If XNACK is enabled, use the constrained opcodes when the first load is
1727+ // under-aligned.
1728+ const MachineMemOperand *MMO = *CI.I ->memoperands_begin ();
1729+ bool NeedsConstrainedOpc =
1730+ STM->isXNACKEnabled () && MMO->getAlign ().value () < Width * 4 ;
17041731 switch (Width) {
17051732 default :
17061733 return 0 ;
17071734 case 2 :
1708- return AMDGPU::S_LOAD_DWORDX2_IMM;
1735+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736+ : AMDGPU::S_LOAD_DWORDX2_IMM;
17091737 case 3 :
1710- return AMDGPU::S_LOAD_DWORDX3_IMM;
1738+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739+ : AMDGPU::S_LOAD_DWORDX3_IMM;
17111740 case 4 :
1712- return AMDGPU::S_LOAD_DWORDX4_IMM;
1741+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742+ : AMDGPU::S_LOAD_DWORDX4_IMM;
17131743 case 8 :
1714- return AMDGPU::S_LOAD_DWORDX8_IMM;
1744+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745+ : AMDGPU::S_LOAD_DWORDX8_IMM;
17151746 }
1747+ }
17161748 case GLOBAL_LOAD:
17171749 switch (Width) {
17181750 default :
0 commit comments