@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
352352 return 1 ;
353353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
355357 case AMDGPU::S_LOAD_DWORDX2_IMM:
356358 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357359 case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363365 return 2 ;
364366 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365367 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
366370 case AMDGPU::S_LOAD_DWORDX3_IMM:
367371 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368372 case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
374378 return 3 ;
375379 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376380 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
377383 case AMDGPU::S_LOAD_DWORDX4_IMM:
378384 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379385 case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
385391 return 4 ;
386392 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387393 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
388396 case AMDGPU::S_LOAD_DWORDX8_IMM:
389397 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390398 return 8 ;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
499507 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500508 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501509 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
502514 return S_BUFFER_LOAD_IMM;
503515 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504516 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505517 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506518 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507519 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
508524 return S_BUFFER_LOAD_SGPR_IMM;
509525 case AMDGPU::S_LOAD_DWORD_IMM:
510526 case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
587603 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588604 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589605 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
590610 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591611 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592612 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593613 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594614 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595615 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
596620 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597621 case AMDGPU::S_LOAD_DWORD_IMM:
598622 case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,13 +727,21 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703727 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704728 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705729 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
706734 Result.SOffset = true ;
707735 [[fallthrough]];
708736 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709737 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710738 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711739 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712740 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
713745 case AMDGPU::S_LOAD_DWORD_IMM:
714746 case AMDGPU::S_LOAD_DWORDX2_IMM:
715747 case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
16791711 return New;
16801712}
16811713
1714+ static bool needsConstrainedOpcode (const GCNSubtarget &STM,
1715+ ArrayRef<MachineMemOperand *> MMOs,
1716+ unsigned Width) {
1717+ // Conservatively returns true if not found the MMO.
1718+ return STM.isXNACKEnabled () &&
1719+ (MMOs.size () != 1 || MMOs[0 ]->getAlign ().value () < Width * 4 );
1720+ }
1721+
16821722unsigned SILoadStoreOptimizer::getNewOpcode (const CombineInfo &CI,
16831723 const CombineInfo &Paired) {
16841724 const unsigned Width = CI.Width + Paired.Width ;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
16961736
16971737 case UNKNOWN:
16981738 llvm_unreachable (" Unknown instruction class" );
1699- case S_BUFFER_LOAD_IMM:
1739+ case S_BUFFER_LOAD_IMM: {
1740+ // If XNACK is enabled, use the constrained opcodes when the first load is
1741+ // under-aligned.
1742+ bool NeedsConstrainedOpc =
1743+ needsConstrainedOpcode (*STM, CI.I ->memoperands (), Width);
17001744 switch (Width) {
17011745 default :
17021746 return 0 ;
17031747 case 2 :
1704- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1748+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1749+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
17051750 case 3 :
1706- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1751+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1752+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
17071753 case 4 :
1708- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1754+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1755+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
17091756 case 8 :
1710- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1757+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1758+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
17111759 }
1712- case S_BUFFER_LOAD_SGPR_IMM:
1760+ }
1761+ case S_BUFFER_LOAD_SGPR_IMM: {
1762+ // If XNACK is enabled, use the constrained opcodes when the first load is
1763+ // under-aligned.
1764+ bool NeedsConstrainedOpc =
1765+ needsConstrainedOpcode (*STM, CI.I ->memoperands (), Width);
17131766 switch (Width) {
17141767 default :
17151768 return 0 ;
17161769 case 2 :
1717- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1770+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1771+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
17181772 case 3 :
1719- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1773+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1774+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
17201775 case 4 :
1721- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1776+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1777+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
17221778 case 8 :
1723- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1779+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1780+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17241781 }
1782+ }
17251783 case S_LOAD_IMM: {
17261784 // If XNACK is enabled, use the constrained opcodes when the first load is
17271785 // under-aligned.
1728- const MachineMemOperand *MMO = *CI.I ->memoperands_begin ();
17291786 bool NeedsConstrainedOpc =
1730- STM-> isXNACKEnabled () && MMO-> getAlign (). value () < Width * 4 ;
1787+ needsConstrainedOpcode (*STM, CI. I -> memoperands (), Width) ;
17311788 switch (Width) {
17321789 default :
17331790 return 0 ;
0 commit comments