diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 8b1d4ba68a444..aaecfa1e33f06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,6 +14,10 @@
 /// MFMA opcode.
 ///
 /// TODO:
+/// - Handle rewrites of phis. This must be more careful than normal about the
+///   reassignment. We do not want to introduce an AGPR-to-AGPR copy inside of
+///   a loop, so it depends on the exact assignment of the copy.
+///
 /// - Update LiveIntervals incrementally instead of recomputing from scratch
 ///
 //===----------------------------------------------------------------------===//
@@ -60,6 +64,25 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
     return TII.isMAI(MI) &&
            AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
   }
+  /// Find AV_* registers assigned to AGPRs (or virtual registers which were
+  /// already required to be AGPR).
+  ///
+  /// \return the physical register that \p VReg is assigned to if it is an
+  /// AGPR, otherwise MCRegister().
+  MCRegister getAssignedAGPR(Register VReg) const {
+    MCRegister PhysReg = VRM.getPhys(VReg);
+    if (!PhysReg)
+      return MCRegister();
+
+    // If this is an AV register, we have to check if the actual assignment is
+    // to an AGPR.
+    const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+    return TRI.isAGPRClass(AssignedRC) ? PhysReg : MCRegister();
+  }
+
+  bool tryReassigningMFMAChain(MachineInstr &MFMA, Register MFMAHintReg,
+                               MCPhysReg PhysRegHint) const;
+
   /// Compute the register class constraints based on the uses of \p Reg,
   /// excluding MFMA uses which can be rewritten to change the register
   /// class constraint. This should be nearly identical to
@@ -74,6 +97,8 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
       Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
       SmallSetVector<Register, 4> &RewriteRegs) const;
 
+  bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
+  bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
   bool run(MachineFunction &MF) const;
 };
 
@@ -154,6 +179,87 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
   return true;
 }
 
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryReassigningMFMAChain(
+    MachineInstr &MFMA, Register MFMAHintReg, MCPhysReg PhysRegHint) const {
+  // src2 and dst have the same physical class constraint; try to preserve
+  // the original src2 subclass if one exists.
+  SmallVector<MachineInstr *, 4> RewriteCandidates = {&MFMA};
+  SmallSetVector<Register, 4> RewriteRegs;
+
+  // Make sure we reassign the MFMA we found the copy from first. We want
+  // to ensure dst ends up in the physreg we were originally copying to.
+  RewriteRegs.insert(MFMAHintReg);
+
+  // We've found av = COPY (MFMA) (or MFMA (v = COPY av)) and need to verify
+  // that we can trivially rewrite src2 to use the new AGPR. If we can't
+  // trivially replace it, we will induce as many copies as we would have
+  // emitted in the first place, need to assign another register, and need to
+  // figure out where to put them. The live range splitting is smarter than
+  // anything we're doing here, so trust it did something reasonable.
+  //
+  // Note recomputeRegClassExceptRewritable will consider the constraints of
+  // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
+  if (!recomputeRegClassExceptRewritable(MFMAHintReg, RewriteCandidates,
+                                         RewriteRegs)) {
+    LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
+                      << printReg(MFMAHintReg, &TRI) << '\n');
+    return false;
+  }
+
+  // If src2 and dst are different registers, we need to also reassign the
+  // input to an available AGPR if it is compatible with all other uses.
+  //
+  // If we can't reassign it, we'd need to introduce a different copy
+  // which is likely worse than the copy we'd be saving.
+  //
+  // It's likely that the MFMA is used in sequence with other MFMAs; if we
+  // cannot migrate the full use/def chain of MFMAs, we would need to
+  // introduce intermediate copies somewhere. So we only make the
+  // transform if all the interfering MFMAs can also be migrated. Collect
+  // the set of rewritable MFMAs and check if we can assign an AGPR at
+  // that point.
+  //
+  // If any of the MFMAs aren't reassignable, we give up and roll back to
+  // the original register assignments.
+
+  using RecoloringStack =
+      SmallVector<std::pair<LiveInterval *, MCRegister>, 8>;
+  RecoloringStack TentativeReassignments;
+
+  for (Register RewriteReg : RewriteRegs) {
+    LiveInterval &LI = LIS.getInterval(RewriteReg);
+    TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
+    LRM.unassign(LI);
+  }
+
+  if (!attemptReassignmentsToAGPR(RewriteRegs, PhysRegHint)) {
+    // Roll back the register assignments to the original state.
+    for (auto [LI, OldAssign] : TentativeReassignments) {
+      if (VRM.hasPhys(LI->reg()))
+        LRM.unassign(*LI);
+      LRM.assign(*LI, OldAssign);
+    }
+
+    return false;
+  }
+
+  // Fix up the register classes of the virtual registers now that we've
+  // committed to the reassignments.
+  for (Register InterferingReg : RewriteRegs) {
+    const TargetRegisterClass *EquivalentAGPRRegClass =
+        TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
+    MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
+  }
+
+  for (MachineInstr *RewriteCandidate : RewriteCandidates) {
+    int NewMFMAOp =
+        AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
+    RewriteCandidate->setDesc(TII.get(NewMFMAOp));
+  }
+
+  return true;
+}
+
 /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
 /// preference to use \p PhysReg first. Returns false if the reassignments
 /// cannot be trivially performed.
@@ -206,6 +312,77 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR(
   return true;
 }
 
+/// Identify copies that look like:
+///   %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src2:vgpr
+///   %agpr = COPY %vgpr
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover the common case.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesToAGPR(
+    Register VReg, MCRegister AssignedAGPR) const {
+  bool MadeChange = false;
+  for (MachineInstr &UseMI : MRI.def_instructions(VReg)) {
+    if (!UseMI.isCopy())
+      continue;
+
+    Register CopySrcReg = UseMI.getOperand(1).getReg();
+    if (!CopySrcReg.isVirtual())
+      continue;
+
+    // TODO: Handle loop phis copied to AGPR, e.g.:
+    //
+    // loop:
+    //   %phi:vgpr = COPY %mfma:vgpr
+    //   %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %phi
+    //   s_cbranch_vccnz loop
+    //
+    // endloop:
+    //   %agpr = mfma
+    //
+    // We need to be sure that %phi is assigned to the same physical register
+    // as %mfma, or else we will just be moving copies into the loop.
+
+    for (MachineInstr &CopySrcDefMI : MRI.def_instructions(CopySrcReg)) {
+      if (isRewriteCandidate(CopySrcDefMI) &&
+          tryReassigningMFMAChain(
+              CopySrcDefMI, CopySrcDefMI.getOperand(0).getReg(), AssignedAGPR))
+        MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// Identify copies that look like:
+///   %src:vgpr = COPY %src:agpr
+///   %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src:vgpr
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover rarer cases, and will generally be
+/// redundant with tryFoldCopiesToAGPR.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
+    Register VReg, MCRegister AssignedAGPR) const {
+  bool MadeChange = false;
+  for (MachineInstr &UseMI : MRI.use_instructions(VReg)) {
+    if (!UseMI.isCopy())
+      continue;
+
+    Register CopyDstReg = UseMI.getOperand(0).getReg();
+    if (!CopyDstReg.isVirtual())
+      continue;
+
+    for (MachineInstr &CopyUseMI : MRI.use_instructions(CopyDstReg)) {
+      if (isRewriteCandidate(CopyUseMI)) {
+        if (tryReassigningMFMAChain(CopyUseMI, CopyDstReg,
+                                    VRM.getPhys(CopyDstReg)))
+          MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
 bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
   // This only applies on subtargets that have a configurable AGPR vs. VGPR
   // allocation.
@@ -222,124 +399,14 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
     Register VReg = Register::index2VirtReg(I);
 
-    Register PhysReg = VRM.getPhys(VReg);
-    if (!PhysReg)
+    MCRegister AssignedAGPR = getAssignedAGPR(VReg);
+    if (!AssignedAGPR)
       continue;
 
-    // Find AV_* registers assigned to AGPRs.
-    const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
-    if (!TRI.hasAGPRs(VirtRegRC))
-      continue;
-
-    const TargetRegisterClass *AssignedRC = VirtRegRC;
-    if (TRI.hasVGPRs(VirtRegRC)) {
-      // If this is an AV register, we have to check if the actual assignment is
-      // to an AGPR
-      AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
-      if (!TRI.isAGPRClass(AssignedRC))
-        continue;
-    }
-
-    LiveInterval &LI = LIS.getInterval(VReg);
-
-    for (VNInfo *VNI : LI.vnis()) {
-      if (VNI->isPHIDef() || VNI->isUnused())
-        continue;
-
-      MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
-      if (!DefMI || !DefMI->isCopy())
-        continue;
-
-      Register MFMADstReg = DefMI->getOperand(1).getReg();
-      if (!MFMADstReg.isVirtual())
-        continue;
-
-      LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg);
-      LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot());
-      MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def);
-      if (!MFMA || !isRewriteCandidate(*MFMA))
-        continue;
-
-      // src2 and dst have the same physical class constraint; try to preserve
-      // the original src2 subclass if one were to exist.
-      SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
-      SmallSetVector<Register, 4> RewriteRegs;
-
-      // Make sure we reassign the MFMA we found the copy from first. We want
-      // to ensure dst ends up in the physreg we were originally copying to.
-      RewriteRegs.insert(MFMADstReg);
-
-      // We've found av = COPY (MFMA), and need to verify that we can trivially
-      // rewrite src2 to use the new AGPR. If we can't trivially replace it,
-      // we're going to induce as many copies as we would have emitted in the
-      // first place, as well as need to assign another register, and need to
-      // figure out where to put them. The live range splitting is smarter than
-      // anything we're doing here, so trust it did something reasonable.
-      //
-      // Note recomputeRegClassExceptRewritable will consider the constraints of
-      // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
-      if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
-                                             RewriteRegs)) {
-        LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
-                          << printReg(MFMADstReg, &TRI) << '\n');
-        continue;
-      }
-
-      // If src2 and dst are different registers, we need to also reassign the
-      // input to an available AGPR if it is compatible with all other uses.
-      //
-      // If we can't reassign it, we'd need to introduce a different copy
-      // which is likely worse than the copy we'd be saving.
-      //
-      // It's likely that the MFMA is used in sequence with other MFMAs; if we
-      // cannot migrate the full use/def chain of MFMAs, we would need to
-      // introduce intermediate copies somewhere. So we only make the
-      // transform if all the interfering MFMAs can also be migrated. Collect
-      // the set of rewritable MFMAs and check if we can assign an AGPR at
-      // that point.
-      //
-      // If any of the MFMAs aren't reassignable, we give up and rollback to
-      // the original register assignments.
-
-      using RecoloringStack =
-          SmallVector<std::pair<LiveInterval *, MCRegister>, 8>;
-      RecoloringStack TentativeReassignments;
-
-      for (Register RewriteReg : RewriteRegs) {
-        LiveInterval &LI = LIS.getInterval(RewriteReg);
-        TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
-        LRM.unassign(LI);
-      }
-
-      if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) {
-        // Roll back the register assignments to the original state.
-        for (auto [LI, OldAssign] : TentativeReassignments) {
-          if (VRM.hasPhys(LI->reg()))
-            LRM.unassign(*LI);
-          LRM.assign(*LI, OldAssign);
-        }
-
-        continue;
-      }
-
-      // Fixup the register classes of the virtual registers now that we've
-      // committed to the reassignments.
-      for (Register InterferingReg : RewriteRegs) {
-        const TargetRegisterClass *EquivalentAGPRRegClass =
-            TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
-        MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
-      }
-
-      for (MachineInstr *RewriteCandidate : RewriteCandidates) {
-        int NewMFMAOp =
-            AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
-        RewriteCandidate->setDesc(TII.get(NewMFMAOp));
-      }
-
-      // We likely left an identity copy behind after assignment; let
-      // VirtRegRewriter deal with it later.
+ if (tryFoldCopiesToAGPR(VReg, AssignedAGPR)) + MadeChange = true; + if (tryFoldCopiesFromAGPR(VReg, AssignedAGPR)) MadeChange = true; - } } return MadeChange; diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 6dcfda3117234..1c5e0e362e359 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -97,8 +97,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,10 +126,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_64_align2 = COPY 
[[GLOBAL_LOAD_DWORDX2_]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -200,62 +200,3 @@ body: | GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... - -# Degenerate case. Copy from AGPR to VGPR is dead undef subreg def ---- -name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - - ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: SI_RETURN - %0:vreg_64_align2 = COPY $vgpr4_vgpr5 - %1:av_64_align2 = COPY $vgpr0_vgpr1 - %2:av_64_align2 = COPY $vgpr2_vgpr3 - %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) - %4:vreg_128_align2 = COPY %3 - undef %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1) - SI_RETURN -... - -# Degenerate case. Copy from AGPR to VGPR is dead, but same register -# is redefined as whole register. 
---- -name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - - ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: SI_RETURN - %0:vreg_64_align2 = COPY $vgpr4_vgpr5 - %1:av_64_align2 = COPY $vgpr0_vgpr1 - %2:av_64_align2 = COPY $vgpr2_vgpr3 - %3:areg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - %4:vreg_64_align2 = COPY %3 - %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORDX2 %0, %4, 0, 0, implicit $exec :: (store (s64), addrspace 1) - SI_RETURN -... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index e22775eecca24..72f6cb32d1b66 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -305,14 +305,14 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub0_sub1 - ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3 - ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit 
$mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 + ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 + ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 81613f69c982b..343a5c8511ee9 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -598,9 +598,11 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_write_b32 a29, v61 ; CHECK-NEXT: v_accvgpr_write_b32 a30, v62 ; CHECK-NEXT: v_accvgpr_write_b32 a31, v63 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 ; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000 +; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 +; CHECK-NEXT: v_accvgpr_read_b32 v32, a32 ; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32 +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31] ; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -611,9 +613,12 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 @@ -637,19 +642,10 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 ; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 ; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 ; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 ; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 @@ -678,58 +674,26 @@ define 
amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v34, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v33, a31 -; CHECK-NEXT: v_accvgpr_read_b32 v32, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33] +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 ; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33] +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 -; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 -; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80 -; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -749,15 +713,12 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64(double %arg0, d ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: 
v_and_b32_e32 v8, 0x3ff, v31 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v8 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() @@ -776,18 +737,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a0 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[10:11] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] ; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[4:5], v[6:7], v[0:1] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() @@ -807,32 +765,16 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg( ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> @@ -851,32 +793,32 @@ define 
amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_ ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v18, 4.0 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a1 +; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16 +; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15 +; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14 +; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13 +; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12 +; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11 +; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10 +; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9 +; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8 +; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7 +; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6 +; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5 +; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3 +; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2 +; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17] +; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48 -; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] ; CHECK-NEXT: s_endpgm %def = call <32 x float> asm sideeffect "; def $0", "=a"() %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll new file mode 100644 index 0000000000000..122d46b39ff32 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -0,0 +1,454 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck %s + +; After reassigning the MFMA to use AGPRs, we've alleviated enough +; register pressure to try eliminating the spill of %spill with the freed +; up VGPR. 
+define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 { +; CHECK-LABEL: eliminate_spill_after_mfma_rewrite: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a3, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a51, 
off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[32:63], v[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 +; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a60, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a59, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a58, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a57, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a56, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a55, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a54, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a53, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a52, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a51, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a50, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a49, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a48, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a47, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a46, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a45, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a44, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a43, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a42, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a37, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a36, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 +; CHECK-NEXT: s_waitcnt 
vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a46, off, s[0:3], s32 
offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0) + %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"() + %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0 + %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1 + %spill = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai) + %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"() + %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"() + store volatile <32 x i32> %v0, ptr addrspace(1) %ptr + store volatile <32 x i32> %v1, ptr addrspace(1) %ptr + store volatile <4 x i32> %spill, ptr addrspace(1) %ptr + ret void +} + +; Same, except we fold out 2 spills from %spill0 and %spill1 +define void 
@eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 {
+; CHECK-LABEL: eliminate_spill_after_mfma_rewrite_x2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a3, v5
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v4
+; CHECK-NEXT:    v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[32:63], v[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_write_b32 a63, v31
+; CHECK-NEXT:    v_accvgpr_write_b32 a62, v30
+; CHECK-NEXT:    v_accvgpr_write_b32 a61, v29
+; CHECK-NEXT:    v_accvgpr_write_b32 a60, v28
+; CHECK-NEXT:    v_accvgpr_write_b32 a59, v27
+; CHECK-NEXT:    v_accvgpr_write_b32 a58, v26
+; CHECK-NEXT:    v_accvgpr_write_b32 a57, v25
+; CHECK-NEXT:    v_accvgpr_write_b32 a56, v24
+; CHECK-NEXT:    v_accvgpr_write_b32 a55, v23
+; CHECK-NEXT:    v_accvgpr_write_b32 a54, v22
+; CHECK-NEXT:    v_accvgpr_write_b32 a53, v21
+; CHECK-NEXT:    v_accvgpr_write_b32 a52, v20
+; CHECK-NEXT:    v_accvgpr_write_b32 a51, v19
+; CHECK-NEXT:    v_accvgpr_write_b32 a50, v18
+; CHECK-NEXT:    v_accvgpr_write_b32 a49, v17
+; CHECK-NEXT:    v_accvgpr_write_b32 a48, v16
+; CHECK-NEXT:    v_accvgpr_write_b32 a47, v15
+; CHECK-NEXT:    v_accvgpr_write_b32 a46, v14
+; CHECK-NEXT:    v_accvgpr_write_b32 a45, v13
+; CHECK-NEXT:    v_accvgpr_write_b32 a44, v12
+; CHECK-NEXT:    v_accvgpr_write_b32 a43, v11
+; CHECK-NEXT:    v_accvgpr_write_b32 a42, v10
+; CHECK-NEXT:    v_accvgpr_write_b32 a41, v9
+; CHECK-NEXT:    v_accvgpr_write_b32 a40, v8
+; CHECK-NEXT:    v_accvgpr_write_b32 a39, v7
+; CHECK-NEXT:    v_accvgpr_write_b32 a38, v6
+; CHECK-NEXT:    v_accvgpr_write_b32 a37, v5
+; CHECK-NEXT:    v_accvgpr_write_b32 a36, v4
+; CHECK-NEXT:    v_accvgpr_write_b32 a35, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a34, v2
+; CHECK-NEXT:    v_accvgpr_write_b32 a33, v1
+; CHECK-NEXT:    v_accvgpr_write_b32 a32, v0
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a3
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a2
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0)
+  %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"()
+  %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0
+  %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1
+  %spill0 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai)
+  %spill1 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai)
+  %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"()
+  %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"()
+  store volatile <32 x i32> %v0, ptr addrspace(1) %ptr
+  store volatile <32 x i32> %v1, ptr addrspace(1) %ptr
+  store volatile <4 x i32> %spill0, ptr addrspace(1) %ptr
+  store volatile <4 x i32> %spill1, ptr addrspace(1) %ptr
+  ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) #1
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }