diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3979e1e0c44aa..a116b57c85a88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
 
 class SIFoldOperandsImpl {
 public:
+  MachineFunction *MF;
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.Def.OpToFold;
+
+  // Verify the register is compatible with the operand.
+  if (const TargetRegisterClass *OpRC =
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+    const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+    const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+    unsigned NewSubReg = New->getSubReg();
+    unsigned OldSubReg = Old.getSubReg();
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (NewSubReg && OldSubReg) {
+      unsigned PreA, PreB;
+      ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+                                                NewSubReg, PreA, PreB);
+    } else if (OldSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+    } else if (NewSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+    }
+
+    if (!ConstrainRC)
+      return false;
+
+    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+                        << " to " << TRI->getRegClassName(ConstrainRC)
+                        << '\n');
+      return false;
+    }
+  }
+
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1429,30 +1460,9 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
-  if (!FoldingImmLike) {
-    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
-      // Don't fold if OpToFold doesn't hold an aligned register.
-      const TargetRegisterClass *RC =
-          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
-      assert(RC);
-      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
-        unsigned SubReg = OpToFold.getSubReg();
-        if (const TargetRegisterClass *SubRC =
-                TRI->getSubRegisterClass(RC, SubReg))
-          RC = SubRC;
-      }
-
-      if (!RC || !TRI->isProperlyAlignedRC(*RC))
-        return;
-    }
-
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
-    // FIXME: We could try to change the instruction from 64-bit to 32-bit
-    // to enable more folding opportunities. The shrink operands pass
-    // already does this.
-    return;
-  }
 
+  // FIXME: We could try to change the instruction from 64-bit to 32-bit
+  // to enable more folding opportunities. The shrink operands pass
+  // already does this.
   tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
 }
@@ -2747,6 +2757,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+  this->MF = &MF;
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
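The interesting case in the new `updateOperand` check is a fold that substitutes a subregister use, where only `NewSubReg` is set. Below is a minimal sketch of how that branch resolves the motivating gfx90a case from the test added at the end of this patch; the concrete register-class constants are spelled out here for illustration only, whereas the real code derives `OpRC` from the operand descriptor via `TII->getRegClass`:

```cpp
// Fold attempt: replace 'killed %4' in V_PK_ADD_F32 with '%2.sub1_sub2',
// given %4:vreg_64_align2 = COPY killed %2.sub1_sub2 and %2:vreg_96_align2.
//   Old = %4            -> OldSubReg == 0 (no subregister)
//   New = %2.sub1_sub2  -> NewSubReg == AMDGPU::sub1_sub2
//   OpRC: the VS_64 operand needs an even-aligned VGPR pair on subtargets
//         where needsAlignedVGPRs() holds, modeled here as VReg_64_Align2.
const TargetRegisterClass *ConstrainRC =
    TRI->getMatchingSuperRegClass(/*NewRC=*/&AMDGPU::VReg_96_Align2RegClass,
                                  /*OpRC=*/&AMDGPU::VReg_64_Align2RegClass,
                                  AMDGPU::sub1_sub2);
// sub1_sub2 of an even-aligned 96-bit tuple always starts at an odd VGPR,
// so no subclass of VReg_96_Align2 qualifies; ConstrainRC is null and the
// fold is rejected instead of producing a use the verifier would flag.
```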
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1b9c0f..8326862706a02 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_64 = IMPLICIT_DEF
     %2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_96 = IMPLICIT_DEF
     %2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_128 = IMPLICIT_DEF
     %2:areg_128_align2 = COPY killed %1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0accce783..5f9b71c0c2198 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_64 = IMPLICIT_DEF
     %2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
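In the AGPR store tests above (and the VGPR tests that follow), the value being folded is an `IMPLICIT_DEF` in the unaligned class and no subregisters are involved, so `ConstrainRC` is simply the operand's class and the new code can retype the register instead of refusing the fold. A sketch of why `MRI->constrainRegClass` succeeds there, with the class constants written out for illustration:

```cpp
// %1:areg_64 = IMPLICIT_DEF feeds %2:areg_64_align2 = COPY killed %1, and
// the store uses %2. Folding %1 into the store constrains %1 to the
// operand's class; constrainRegClass succeeds iff a common subclass exists:
const TargetRegisterClass *Common = TRI->getCommonSubClass(
    &AMDGPU::AReg_64RegClass, &AMDGPU::AReg_64_Align2RegClass);
// Common == AReg_64_Align2, so %1 is retyped in place, the COPY becomes a
// dead identity copy, and the IMPLICIT_DEF ends up directly in the aligned
// class, matching the updated GFX90A check lines.
```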
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_96 = IMPLICIT_DEF
     %2:vreg_96_align2 = COPY killed %1
@@ -326,11 +324,59 @@ body: |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{ $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_128 = IMPLICIT_DEF
     %2:vreg_128_align2 = COPY killed %1
     GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, implicit $exec
 ...
+
+# Make sure the alignment requirement is respected for VS_64 operand
+# uses.
+---
+name: aligned_vgpr_vs_64_constraint
+tracksRegLiveness: true
+isSSA: true
+body: |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr8_sgpr9
+
+    ; GFX908-LABEL: name: aligned_vgpr_vs_64_constraint
+    ; GFX908: liveins: $vgpr0, $sgpr8_sgpr9
+    ; GFX908-NEXT: {{ $}}
+    ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+    ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX908-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+    ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX908-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX908-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    ; GFX908-NEXT: S_ENDPGM 0
+    ;
+    ; GFX90A-LABEL: name: aligned_vgpr_vs_64_constraint
+    ; GFX90A: liveins: $vgpr0, $sgpr8_sgpr9
+    ; GFX90A-NEXT: {{ $}}
+    ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+    ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2
+    ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX90A-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, killed [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    ; GFX90A-NEXT: S_ENDPGM 0
+    %0:sgpr_64 = COPY $sgpr8_sgpr9
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR %0, %1, 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+    %3:vgpr_32 = COPY %2.sub0
+    %4:vreg_64_align2 = COPY killed %2.sub1_sub2
+    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %6:vreg_64_align2 = REG_SEQUENCE %3, %subreg.sub0, %5, %subreg.sub1
+    %7:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed %6, 0, killed %4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B64_gfx9 %5, killed %7, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+    S_ENDPGM 0
+
+...
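The two check prefixes in the new test capture the subtarget split precisely: gfx908 imposes no tuple-alignment rule, so the copy of `%2.sub1_sub2` folds into the `V_PK_ADD_F32`, while gfx90a must keep `[[COPY3]]`. A tiny standalone model of the underlying parity argument (hypothetical helper, independent of the LLVM APIs):

```cpp
#include <cstdio>

// Hypothetical helper: an align2 64-bit operand needs an even first VGPR.
static bool isAlignedPair(unsigned FirstReg) { return FirstReg % 2 == 0; }

int main() {
  // A vreg_96_align2 value occupies v[B], v[B+1], v[B+2] with B even.
  for (unsigned Base = 0; Base <= 8; Base += 2) {
    unsigned Sub0Sub1 = Base;     // sub0_sub1 starts at the tuple base
    unsigned Sub1Sub2 = Base + 1; // sub1_sub2 starts one register later
    std::printf("base v%u: sub0_sub1 %s, sub1_sub2 %s\n", Base,
                isAlignedPair(Sub0Sub1) ? "aligned" : "unaligned",
                isAlignedPair(Sub1Sub2) ? "aligned" : "unaligned");
  }
  // sub1_sub2 starts at an odd VGPR for every even base, so
  // getMatchingSuperRegClass finds no qualifying subclass on gfx90a and the
  // fold is rejected, exactly as the GFX90A check lines above expect.
  return 0;
}
```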